Written by

Question Archunan K · Apr 12, 2019

Conversion of RTF document to plain Text/HTML document using %iKnow.Source.Converter

#Caché #InterSystems Natural Language Processing (NLP, iKnow) #Beginner

Hello All,

Is there any way to convert RTF content into a Text document/HTML document in Caché ObjectScript or by using %iKnow.Source.Converter?

Please advise.

Thanks,

Archunan K

Discussion (3)0

Add reply

Comments

Eduard Lebedyuk · Apr 12, 2019

Not sure about iKnow, but check the article Converting documents with Caché and LibreOffice.

0 0

Benjamin De Boe · Apr 15, 2019

No, iKnow doesn't extract text from RTF prior to its NLP task. Besides the LibreOffice suggestion, I've also heard of people who've worked with simple Java RTF extractors (part of the regular JDK) and Tika in the past.

0 0

Stuart Strickland · Mar 21, 2025

Hi,

I knocked up some code to extract the plain text from an RTF document. It works for my purposes, but I would like to know if anyone can find a case where it does a bad job.

Copy your .rtf file into a flat array, eg x(1)=first line, x(2)=second line then:

d ##class(yourclass).StripRTF(.x,.y)

and you'll get the plain text in y

/// Converts an RTF document held in a flat array (rtfText(1), rtfText(2), ...) to plain text.
/// The RTF may be split across the array members at any point.
/// On return %plainText holds one array item per line of text and the quit value is the
/// number of items written (the final value of %line).
/// Note: the string returned by the outermost ..Brace call is not stored anywhere, so only the
/// lines that ..Brace itself writes into %plainText appear in the result.
ClassMethod StripRTF(ByRef rtfText = "", ByRef %plainText) As %Integer
{
 // debugging aid: view an rtf doc split into groups, indented one step each time the group level goes up
 // S IN=0,C=0 F I=1:1 S C=##class(ZSS.SMTP).EXTRACT(.x,I) Q:C="" I C="{" {S IN=IN+1 W !?IN*2,"(",IN,")"} W C I C="}" {W "(",IN,")" S IN=IN-1 W !?IN*2}
 // %line is the running line counter that ..Brace increments; NEW it so a caller's copy is untouched
 new %line
 kill %plainText
 set %line=0
 // possible speed-up: work out how to blank already-processed characters in the R array and
 // pull the position pointer back accordingly
 // work on a copy so the caller's array is left alone
 merge R=rtfText
 // position 1 is handed over by value; ..Brace starts reading at the character after it
 set startPos=1
 do ..Brace(.R,startPos)
 quit %line
}

/// A bit like $Extract but the first argument is a single level array, like array(1)="some text", array(2)="some more text"
/// ..EXTRACT(.array,9,11) would return "tso" as the 9th, 10th, and 11th characters of the data in the array
/// array     - subscripts 1,2,3... are treated as one continuous string; scanning stops at the first missing subscript
/// from, to  - 1-based character positions across the whole array; to defaults to from (single character)
/// SetToNull - when true the extracted characters are also removed from the array (mimics SET $EXTRACT)
/// Returns the requested characters, or fewer (possibly "") when the array holds less data than requested.
ClassMethod EXTRACT(ByRef array, from As %Integer, to As %Integer, SetToNull = 0) As %String
{
 i '$d(to) s to=from
 // number of characters requested; must be computed before from/to are shifted below.
 // (was from-to+1, which never matched for multi-character requests, so the loop always
 // walked the whole array instead of stopping once the text had been collected)
 s wanted=to-from+1
 s result=""
 f index=1:1 {
  q:'$d(array(index))
  s chunk=array(index)
  // from/to are relative to the current element; only elements that reach "from" contribute
  i from'>$l(chunk) {
   s result=result_$e(chunk,from,to)
   // mimic SET $EXTRACT
   i SetToNull s $e(array(index),from,to)=""
   // everything asked for has been collected - no need to look at later elements
   q:$l(result)=wanted
  }
  // re-base the positions onto the next element (use the length before any removal above)
  s from=from-$l(chunk),to=to-$l(chunk)
  // once the start has been passed, continue from the beginning of each following element
  i from<0 s from=0
 }
 q result
}
/// discard everything between two braces including the braces
/// done by either moving i to the last brace
/// or if SetToNull is passed as 1 then actually removing the characters from the RTF array
/// rtfText   - the RTF document as a flat array (see ..EXTRACT)
/// i         - on entry a position inside the group to discard; on exit the position of the "}"
///             that closes that group (or past the end of the data if the braces never balance)
/// SetToNull - when true the characters from the entry position up to i are removed via ..EXTRACT
/// NOTE(review): with SetToNull=1 the pointer i is not pulled back after the characters are removed;
/// none of the calls in this class pass SetToNull, so confirm the intended behaviour before using it.
ClassMethod Discard(ByRef rtfText, ByRef i, SetToNull = 0)
{
 // inGroup is the nesting depth relative to the group being discarded (1 = inside it)
 s inGroup=1,start=i
 f  {
  s discard=..EXTRACT(.rtfText,$i(i))
  // ran off the end of the data (unbalanced file) - stop rather than loop forever
  q:discard=""
  i discard="\" {
   // \\ \{ and \} are literal characters (the same three escapes ..Brace lets through),
   // so the escaped character must not change the nesting depth
   s peek=..EXTRACT(.rtfText,i+1)
   i peek'="","\{}"[peek s i=i+1
   continue
  }
  i discard="{" s inGroup=inGroup+1
  i discard="}" s inGroup=inGroup-1
  // the brace that closes the discarded group has just been consumed
  q:inGroup<1
 }
 i SetToNull d ..EXTRACT(.rtfText,start,i,1)
}
/// return the contents of a Slash
/// Called with i on a "\" character; reads the following characters one at a time with ..EXTRACT
/// and builds up the backslash item in string (which starts as "\").
/// Reading stops at: end of data, a space (consumed), a "}" or "\" (i is stepped back one so the
/// caller reads that character again), a \'hh hex item, or when ..Discard has been called.
/// Returns the replacement text for the item: $c(13,10) for \par, quote characters for the quote
/// words, the character for a \'hh hex code, "" for control words that are simply dropped,
/// or $c(127) as a marker telling ..Brace that ..Discard was called.
ClassMethod Slash(ByRef R, ByRef i) As %String
{
 s string="\"
 F  {
  Set char=..EXTRACT(.R, $I(i))
  q:char="" // should never happen but don't want to get stuck in a loop because of a bad file
  // leave the closing brace for the caller to read
  i char="}" s i=i-1 q
  // a space ends the item and is swallowed
  i char=" " q
  // the next backslash item starts here - leave it for the caller
  i char="\" s i=i-1 q
  i char="{" {
   // only when what has been read so far looks like \word (a letter then letters/digits)
   i string?1"\"1a.an {
    // everything inside something like \abc1{this stuff}
    // example {\fonttbl{\fprq2{02020603050405020304}TimesNew Roman;} should all be discarded
    d ..Discard(.R,.i)
    s string=$c(127)
    q
   }
  // otherwise read the nested group; its text then goes through the checks below like a character
  s char=..Brace(.R,.i)
  }
  // i is on the "*": positions i-2..i+1 are compared with "{\*\"
  i char="*",..EXTRACT(.R, i-2,i+1)="{\*\" {
   // "{\*\" at the start of a brace means the whole brace including nested braces can be ignored
   d ..Discard(.R,.i)
   s string=$c(127)
   q
  }
  i char="'" {
   // hex item: append the quote plus the next two characters, giving e.g. \'93
   s string=string_char_..EXTRACT(.R, $I(i),$I(i))
   q
  }
  s string=string_char
  // everything in here should disappear because it looks like {\stylesheet{
  // (tested after every appended character, so it fires as soon as string equals one of these words)
  i string="\stylesheet"!(string="\info") {
   d ..Discard(.R,.i)
   s string=$c(127)
   q
  }
 }
 //
 // add stuff here for special characters represented by \codeword
 // you could put them on a global
 //
 i string'="" {
  try {
  // catching stupid subscripts
  // a translation stored in the global wins; otherwise string is left unchanged
  s string=$G(^GMAT("RTF","special characters",string),string)
  }
  catch {
   // do nowt
  }
 }
 i string="\lquote" s string="'"
 i string="\rquote" s string="'"
 i string="\ldblquote" s string=""""
 i string="\rdblquote" s string=""""
 // these two must be tested before the general \'hh conversion below
 i string="\'93" s string=""""  // $c($zh("93")) should be a left double quote
 i string="\'94" s string=""""
 // any other \'hh : convert the two hex digits to a character
 // NOTE(review): $ZHEX converts hex to decimal only when its argument is treated as a string -
 // confirm that an all-digit piece such as "20" is not taken as a decimal number here
 i string?1"\'"2an s string=$c($zh($p(string,"'",2)))
 i string="\par" s string=$c(13,10)
 // REMOVING \codename and \codename1 and \codename1;
 i string ?1"\"1a.an.1";" q ""
 // REMOVING \codename-20
 i string ?1"\"1.a1"-"1.n.1";" q ""
 i string="{}" q ""
 q string
 }
/// return the contents of a pair of braces (troosers!)
/// Called with i on the "{" that opens a group; reads forward from the next character until the
/// "}" that ends the group (or the end of the data), recursing into nested groups and handing
/// backslash items to ..Slash.
/// Returns the text collected for the group, with the surrounding braces removed when both are present.
/// Side effect: whenever a nested group returns exactly $c(13,10), the text collected so far is
/// written to the public array %plainText, one item per line, using the public counter %line
/// (both are set up by ..StripRTF).
ClassMethod Brace(ByRef R, ByRef i) As %String
{
 s string="{"
 F  {
  // advance i, then fetch the character at i plus one look-ahead character
  Set char=..EXTRACT(.R, $I(i),i+1)
  // reached the end and there will be a loose "}"
  i char="",string="}" s string=""
  q:char=""
  // escaped characters that should be allowed through to the text
  // (for an escape char keeps both characters and i skips the second one;
  //  otherwise only the first of the two characters is kept)
  // NOTE(review): escape is set here but never read in this method
  s escape=0
  i $lf($lb("\\","\{","\}"),char) s i=i+1,escape=1
  e  s char=$e(char)
  i char="\" {
  s char=..Slash(.R,.i)
  // we hit a ..Discard so remove the brace before it
  // (this removes the last character of string, whatever it is)
  i char=$c(127) s char="",$e(string,*)=""
  s string=string_char
  continue
 }
 i char="{" {
  // nested group: recurse, i comes back positioned on its closing brace
  s char=..Brace(.R,.i)
  i char=$c(13,10) {
   //works for 1st line only s %plainText($i(%line))=string,string="",char=""
  // flush everything collected so far, one %plainText item per CR/LF separated piece
  f {
   s %plainText($i(%line))=$p(string,$c(13,10),1)
   s string=$p(string,$c(13,10),2,*)
   q:string=""
  }
  s char=""
  }
  s string=string_char
  continue
  }
  // HEX ascii
  // NOTE(review): at this point char is either a single character or one of "\\","\{","\}",
  // so this test looks like it can never be true - \'hh items are handled in ..Slash instead
  i char="\'" {
   s char=$C($ZHEX(..EXTRACT(.R, $I(i),$I(i))))
  }
  // for an escape this appends only the second character, i.e. the literal \ { or }
  s string=string_$e(char,*)
  i string="{}" s string=""
  // an unescaped closing brace ends this group
  q:char="}"
 }
 i string="{}" {
  s string=""
 }
 // strip the surrounding braces when both are still present
 i $e(string)="{",$e(string,*)="}" {
  s string=$e(string,2,*-1)
 }
 q string
}

0 0