Discussion (3)0
Log in or sign up to continue

Hi,

I knocked up some code to extract the plain text from an RTF document. It works for my purposes but would like to know if anyone can find a case where it does a bad job.

Copy your .rtf file into a flat array, eg x(1)=first line, x(2)=second line then:

d ##class(yourclass).StripRTF(.x,.y)

and you'll get the plain text in y

/// accepts an RTF doc in array form, doesn't care if lines are split across array members
/// returns plain text from the doc with one array item per line 
/// quit value is number of lines in array
ClassMethod StripRTF(ByRef rtfText = "", ByRef %plainText) As %Integer
{
 // use this code to view an rtf doc split into groups and indented when group level goes up
 // S IN=0,C=0 F I=1:1 S C=##class(ZSS.SMTP).EXTRACT(.x,I) Q:C="" I C="{" {S IN=IN+1 W !?IN*2,"(",IN,")"} W C I C="}" {W "(",IN,")" S IN=IN-1 W !?IN*2}
%line
 kill %plainText Set %line=0
 // you could add some speed here by working out how to set characters in the R array to null and
 // reducing the i pointer accordingly
R=rtfText
..Brace(.R,$i(i))
%line
}

/// A bit like $Extract but the first argument is a single level array, like array(1)="some text", array(2)="some more text"
/// ..EXTRACT(.array,9,11) would return "tso" as the 9th, 10th, and 11th characters of the data in the array
ClassMethod EXTRACT(ByRef array, from As %Integer, to As %Integer, SetToNull = 0) As %String
{
'$d(to) to=from
UncleLen=from-to+1 // length of string to return
string=""
index=1:1 {
  q:'$d(array(index))
short=array(index)
from'>$l(short) {
string=string_$e(short,from,to)
   // mimic SET $EXTRACT
SetToNull $e(array(index),from,to)=""
   q:$l(string)=UncleLen
  }
from=from-$l(short),to=to-$l(short)
from<0 from=0
 }
string
}
/// discard everything between two braces including the braces
/// done by either moving i to the last brace
/// or if SetToNull is passed as 1 then actually removing the characters from the RTF array
ClassMethod Discard(ByRef rtfText, ByRef i, SetToNull = 0)
{
inGroup=1,start=i,stop=i,discarded=""
 f  {
discard=..EXTRACT(.rtfText,$i(i))
  q:discard=""
discard="}" {
inGroup=inGroup-1
  }
discard="{" inGroup=inGroup+1
 q
 }
}
SetToNull ..EXTRACT(.rtfText,start,i,1)
}
/// return the contents of a Slash
ClassMethod Slash(ByRef R, ByRef i) As %String
{
string="\"
 F  {
  Set char=..EXTRACT(.R, $I(i))
  q:char="" // should never happen but don't want to get stuck in a loop because of a bad file
char="}" i=i-1 q
char=" " q
char="\" i=i-1 q
char="{" {
string?1"\"1a.an {
    // everything inside something like \abc1{this stuff}
    // example {\fonttbl{\fprq2{02020603050405020304}TimesNew Roman;} should all be discarded
..Discard(.R,.i)
string=$c(127)
    q
   }
char=..Brace(.R,.i)
  }
char="*",..EXTRACT(.R, i-2,i+1)="{\*\" {
   // "{\*\" at the start of a brace means the whole brace including nested braces can be ignored
..Discard(.R,.i)
string=$c(127)
   q
  }
char="'" {
string=string_char_..EXTRACT(.R, $I(i),$I(i))
   q
  }
string=string_char
  // everything in here should disappear because it looks like {\stylesheet{
string="\stylesheet"!(string="\info") {
..Discard(.R,.i)
string=$c(127)
   q
  }
 }
 //
 // add stuff here for special characters represented by \codeword
 // you could put them on a global
 //
string'="" {
  try {
  // catching stupid subscripts
string=$G(^GMAT("RTF","special characters",string),string)
  }
  catch {
   // do nowt
  }
 }
string="\lquote" string="'"
string="\rquote" string="'"
string="\ldblquote" string=""""
string="\rdblquote" string=""""
string="\'93" string=""""  // $c($zh("93")) should be a left double quote
string="\'94" string=""""
string?1"\'"2an string=$c($zh($p(string,"'",2)))
string="\par" string=$c(13,10)
 // REMOVING \codename and \codename1 and \codename1;
string ?1"\"1a.an.1";" ""
 // REMOVING \codename-20
string ?1"\"1.a1"-"1.n.1";" ""
string="{}" ""
string
 }
/// return the contents of a pair of braces (troosers!)
ClassMethod Brace(ByRef R, ByRef i) As %String
{
string="{"
 F  {
  Set char=..EXTRACT(.R, $I(i),i+1)
  // reached the end and there will be a loose "}"
char="",string="}" string=""
  q:char=""
  // escaped characters that should be allowed through to the text
escape=0
$lf($lb("\\","\{","\}"),char) i=i+1,escape=1
  e  char=$e(char)
char="\" {
char=..Slash(.R,.i)
  // we hit a ..Discard so remove the brace before it
char=$c(127) char="",$e(string,*)=""
string=string_char
  continue
 }
char="{" {
char=..Brace(.R,.i)
char=$c(13,10) {
   //works for 1st line only s %plainText($i(%line))=string,string="",char=""
{
%plainText($i(%line))=$p(string,$c(13,10),1)
string=$p(string,$c(13,10),2,*)
   q:string=""
  }
char=""
  }
string=string_char
  continue
  }
  // HEX ascii
char="\'" {
char=$C($ZHEX(..EXTRACT(.R, $I(i),$I(i))))
  }
string=string_$e(char,*)
string="{}" string=""
  q:char="}"
 }
string="{}" {
string=""
 }
$e(string)="{",$e(string,*)="}" {
string=$e(string,2,*-1)
 }
string
}