citromatik wrote:
> Hi all,
>
> I'm trying to parse a plain text file containing multiple records separated
> by a "//". A sample record can be viewed here:
> http://metagenomics.uv.es/gbexample.txt
>
> Extracting simple fields like the "ACCESSION" number is quite simple:
>
> let LLeter = ['A'-'Z']
> let digit = ['0'-'9']
> let space = ' '
> let ACCESSION = "ACCESSION" space+ LLeter LLeter digit+
> rule gb = parse
> | ACCESSION { ...; gb lexbuf }
> | _ { gb lexbuf }
>
> ...But, what about those multiline records? how can I extract them?
> I've tried using '#'. For example, for obtaining a full "REFERENCE":
>
> let endline = '\n'
> let KWD = endline LLeter+
> let REFERENCE = "REFERENCE" _+
> rule gb = parse
> | ACCESSION { ...; gb lexbuf }
> | REFERENCE#KWD { print_endline (Lexing.lexeme lexbuf); gb lexbuf }
> (* Line 16 *)
> | _ { gb lexbuf }
>
> But this gives me an error when trying to run ocamllex on it:
>
> File "genbank.mll", line 16, character 67: character set expected.
>
> What is this "character set expected" error?
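a#b means "any char from a that does not belong to b"; both a and b must be
character sets, i.e. regexps that match exactly one char.
Neither your REFERENCE nor your KWD represents a set of chars: each of them
matches strings of several chars.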
You can write:
let az = ['a'-'z']
let x = az # ['d'-'h']
> Is there a better (well, good) way to parse the multiline fields?
Maybe someone already wrote an OCaml parser for Genbank (I know I don't have a
complete one, if any; you may ask on caml-list).
Solution 1: don't use ocamllex at all
Process the file line by line (input_line is fine).
Create yourself a fast test_string_prefix function, "fast" boiling down to not
using String.sub.
Then write a pure OCaml parser whose input is the stream of lines. This is not
very different from solution 2 below, which I would choose.
Solution 2: use only ocamllex
Here is the structure of a reasonable parser:
{
(* One parsed record. The fields are mutable so that the lexer actions
   can fill them in one at a time as the corresponding lines are read. *)
type record = {
mutable locus : ...;
mutable definition : string option;
mutable accession : ...;
...
}
(* Returns a fresh record in which no field has been set yet
   (every field shown here starts as None). *)
let new_record () = {
locus = None;
definition = None;
accession = None;
...
}
(* Called by the lexer actions each time an end of line has been consumed.
   The body is left out of this sketch. *)
let newline lexbuf =
...
(* would set the correct line count
for useful error messages *)
}
(* [top record lexbuf] lexes the fields of one record and stores each field
   it recognizes into [record]. It returns [Some record] once the "//"
   terminator line has been read, and [None] at end of input.
   Every action must therefore have type [record option]: an action that
   only stores a field has to finish by calling [top record lexbuf] again,
   otherwise the rule neither type-checks nor reads past the first field. *)
rule top record = parse
    "LOCUS " ...
      { ... (* store the field, then continue with [top record lexbuf] *) }
  | "DEFINITION " ([^'\r' '\n']+ as text) '\r'? '\n'
      {
        newline lexbuf;
        (* Gather the continuation lines before touching the record. *)
        let def_text = continue_definition [text] lexbuf in
        if record.definition <> None then
          ... (* error: multiple DEFINITION fields *);
        record.definition <- Some def_text;
        (* Keep reading the remaining fields of the same record. *)
        top record lexbuf
      }
  | "ACCESSION " ...
      { ... (* store the field, then continue with [top record lexbuf] *) }
  ...
  | "//" '\r'? '\n'
      {
        (* End of the current record: hand it back to the caller. *)
        newline lexbuf;
        Some record
      }
  | eof
      {
        ... ; (* check that the current record is empty *)
        None
      }
  | ""
      { (* report error *) }
(* [continue_definition accu lexbuf] consumes the continuation lines of a
   DEFINITION field. [accu] holds the pieces read so far, most recent first.
   When the input no longer matches a continuation line, the pieces are put
   back in reading order and joined with single spaces.
   NOTE(review): the prefix literal below is a single space, so any further
   indentation of a continuation line ends up in the captured text; confirm
   against the real file layout. *)
and continue_definition accu = parse
    " " ([^'\r' '\n']+ as chunk) '\r'? '\n'
      {
        newline lexbuf;
        let accu = chunk :: accu in
        continue_definition accu lexbuf
      }
  | ""
      {
        let pieces = List.rev accu in
        String.concat " " pieces
      }
{
  (* [scan process_record lexbuf] reads records from [lexbuf] one after the
     other, each into a fresh record, and passes every completed record to
     [process_record]. It stops as soon as [top] returns [None]. *)
  let scan process_record lexbuf =
    let rec loop () =
      match top (new_record ()) lexbuf with
      | Some completed ->
          process_record completed;
          loop ()
      | None -> ()
    in
    loop ()
}
Martin
--
http://mjambon.com/