This commit is contained in:
eherbst 2006-10-26 00:44:28 +00:00
parent 0a2fcb8ce5
commit 1f859496bc
6 changed files with 101 additions and 61 deletions

View File

@ -3,6 +3,15 @@ all: report.pdf
PS4PDFDIR=./ps4pdf
# Commands used by the recipes below, referenced as $(TEX), $(PDFLATEX)
# and $(BIBTEX).
# Example override for a setup where the tools are addressed by explicit path
# (here a MiKTeX install on Windows). To use it, uncomment these four lines,
# adjust TEXDIR, and remove the plain defaults further down: a later
# assignment of the same variable would otherwise replace these values.
#TEXDIR = c:/progra~1/miktex/miktex-2.4/miktex/bin
#TEX = $(TEXDIR)/tex
#PDFLATEX = $(TEXDIR)/pdflatex
#BIBTEX = $(TEXDIR)/bibtex
# Defaults: bare command names, left to the shell's normal command lookup.
TEX = tex
PDFLATEX = pdflatex
BIBTEX = bibtex
export TEXINPUTS=:$(PS4PDFDIR)
# generic rule for using ps4pdf
@ -13,8 +22,8 @@ export TEXINPUTS=:$(PS4PDFDIR)
%.pdf: %.tex preview.sty $(PS4PDFDIR)
$(PS4PDFDIR)/ps4pdf $<
if ( grep 'bibstyle' $*.aux >/dev/null 2>/dev/null ) ; then \
bibtex $*; \
pdflatex $<; \
$(BIBTEX) $*; \
$(PDFLATEX) $<; \
fi
if ( \
grep 'ref{\|tableofcontents\|\\listof\|prosper' $*.tex >/dev/null 2>/dev/null \
@ -22,13 +31,13 @@ export TEXINPUTS=:$(PS4PDFDIR)
|| grep 'Citation.*undefined' $*.log >/dev/null 2>/dev/null \
); then \
echo "## Reruning latex"; \
pdflatex $<; \
$(PDFLATEX) $<; \
fi
# "installing" preview.sty, installing the TeX way
preview.sty: preview.ins preview.dtx
tex preview.ins
$(TEX) preview.ins
# download ps4pdf

View File

@ -912,7 +912,7 @@
number = {ITRI-04-10},
institution = {Information Technology Research Institute, University of Brighton},
note = {Also published in Proceedings of TSD 2004, Text, Speech and Dialogue 7th International Conference, Brno, Czech Republic, September 2004}
}
}
@InProceedings{veronis:2003,
@ -933,7 +933,7 @@
% Melo to taky jmeno:
% Melo to taky jmeno:
% Note = {MSM113200006, LN00A063},
@InProceedings{cmejrek:curin:havelka:2003,
DocumentType = {},
@ -1035,10 +1035,10 @@
Series = "Lecture Notes in Artificial Intelligence",
Volume = 2302,
URL = "http://link.springer.de/link/service/series/0558/tocs/t2302.htm",
Abstract = "
Abstract = "
Constraint programming is an approach to modeling and solving combinatorial
problems that has proven successful in many applications. Building on
techniques developed in AI, logic programming and operations research,
techniques developed in AI, logic programming and operations research,
constraint programming is based on an abstraction that decomposes the
problem solver into a reusable constraint engine and a declarative
program modeling the problem.
@ -1051,7 +1051,7 @@
strategies, such as visual interactive search and parallel search
are covered.
The book is indispensible reading for anyone seriously interested in
The book is indispensable reading for anyone seriously interested in
constraint technology."
}
@ -1061,7 +1061,7 @@
title = "Studien {\"u}ber das deutsche Verbum infinitum",
note = "2nd unrevised edition published 1983 by Max Niemeyer
Verlag, T{\"u}bingen (Linguistische Arbeiten 139).",
}
}
@TECHREPORT{debusmann2004relational,
title = "{A Relational Syntax-Semantics Interface Based on Dependency Grammar}",
@ -1179,7 +1179,7 @@ phenomena in the verbal complex of German verb final sentences.},
}
@Proceedings{ACL:03,
title = {Proceedings of the 41st Annual Meeting of the
title = {Proceedings of the 41st Annual Meeting of the
Association for Computational Linguistics},
booktitle = {Proceedings of the 41st Annual Meeting of the
Association for Computational Linguistics},
@ -2739,15 +2739,15 @@ applications}},
@inProceedings{propbank2,
@inProceedings{propbank2,
key = {Kingsbury,Palmer,2002},
author = {Paul Kingsbury and Martha Palmer},
title = "{From Treebank to PropBank}",
author = {Paul Kingsbury and Martha Palmer},
title = "{From Treebank to PropBank}",
booktitle = {Proceedings of the 3rd International Conference
on Language Resources and Evaluation},
address = {Las Palmas, Spain},
year = {2002}
}
on Language Resources and Evaluation},
address = {Las Palmas, Spain},
year = {2002}
}
@Book{helsche:69,
@ -2956,7 +2956,7 @@ applications}},
@InProceedings{walde-03,
author = {Sabine Schulte im Walde},
title = "{Experiments on the Choice of Features for Learning Verb Classes}",
booktitle = {Proceedings of the 10th Conference of the European Chapter of
booktitle = {Proceedings of the 10th Conference of the European Chapter of
the Association for Computational Linguistics},
pages = {315--322},
year = {2003},
@ -3269,7 +3269,7 @@ applications}},
year = {2003},
volume = {44},
number = {2},
}
}
% ------- knizky z knihovny na MS
@ -3376,7 +3376,7 @@ applications}},
}
% Mel'cukova tlustokniha, co mam od Kateriny Markove
@Book{mel-95,
@Book{mel-95,
author = {Igor Mel'cuk},
title = "{ The Russian Language in the Meaning-Text Perspective}",
publisher = {Wiener Slawistischer Almanach},
@ -3412,7 +3412,7 @@ applications}},
}
@Misc{framenet-03,
author = {Christopher R. Johnson and Miriam R. L. Petruck and Collin F.
author = {Christopher R. Johnson and Miriam R. L. Petruck and Collin F.
Baker and Michael Ellsworth and Josef Ruppenhofer and Charles J. Fillmore},
title = "{FrameNet: Theory and Practice}",
howpublished = {{http://www.icsi.berkeley.edu/~framenet/book/book.html}},
@ -3426,7 +3426,7 @@ applications}},
year = {1962},
}
@InCollection{hasg-03,
@InCollection{hasg-03,
Author = {Haji{\v{c}}ov{\'{a}}, Eva and Sgall, Petr},
title = "{{Dependency syntax in Functional Generative Description}}",
EnglishTitle = {{Dependency syntax in Functional Generative Description}},
@ -3498,7 +3498,7 @@ applications}},
author = {Joan L. Bybee},
title = "{Morphology: A study of the relation between meaning and form }",
publisher = {Benjamins},
year = {1985},
year = {1985},
address={Philadelphia}
}
@ -3535,7 +3535,7 @@ applications}},
journal = "Nordic Journal of Linguistics",
volume = 21,
year = 1998
}
}
@book{davis-01,
author = {Anthony R. Davis},
@ -3624,7 +3624,7 @@ applications}},
}
% disertace Karoliny, taky tam pouziva VALLEX
@PhdThesis{skwarska-04,
@PhdThesis{skwarska-04,
author = {Karol{\'{\i}}na Skwarska},
title = {Konkurence genitivu a akuzativu s tranzitivn{\'{\i}}mi slovesy
v {\v{c}}e{\v{s}}tin{\v{e}}, ru{\v{s}}tin{\v{e}}, pol{\v{s}}tin{\v{e}} a slovin{\v{s}}tin{\v{e}}},
@ -3831,7 +3831,7 @@ applications}},
@inproceedings{salsa-04,
author = {Michael Ellsworth and Katrin Erk and Paul Kingsbury and Sebastian Pado},
title = "{{PropBank}, {SALSA}, and {FrameNet}: How Design Determines Product}",
booktitle = {Proceedings of the LREC 2004 Workshop
booktitle = {Proceedings of the LREC 2004 Workshop
on Building Lexical Resources from Semantically Annotated Corpora},
year = {2004},
address = {Lisbon},
@ -3852,7 +3852,7 @@ applications}},
@Article{semcomparison-03,
Author = {Rambow, Owen and Dorr, Bonnie and Kipper, Karin and Ku{\v{c}}erov{\'{a}}, Ivona and Palmer, Martha},
Title = {{Automatically Deriving Tectogrammatical Labels from Other Resources:
Title = {{Automatically Deriving Tectogrammatical Labels from Other Resources:
A Comparison of Semantic Labels Across Frameworks}},
EnglishTitle = {{Automatically Deriving Tectogrammatical Labels from Other Resources:
A Comparison of Semantic Labels Across Frameworks}},
@ -3950,7 +3950,7 @@ applications}},
volume = {32},
number = {2--3}
}
@Article{pasm-04,
author = {Karel Pala and Pavel Smr{\v{z}}},
@ -3964,7 +3964,7 @@ applications}},
@InProceedings{briscoe-01,
author = {Ted Briscoe},
title = {From dictionary to corpus to self-organizing dictionary:
title = {From dictionary to corpus to self-organizing dictionary:
learning valency associations in the face of variation and change},
booktitle = {Proceedings of Corpus Linguistics 2001},
pages = {79-89},
@ -4205,7 +4205,7 @@ from a general-purpose lexical resource, with the assumption that the
lexical resource describes the word senses of English/French/\ldots,
between which NLP applications will need to disambiguate. The
implication of the paper is, by contrast, that word
senses exist only relative to a task.
senses exist only relative to a task.
}
}
@ -4740,6 +4740,37 @@ strom\r{u} Pra\v{z}sk\'eho z\'avislostn\'{\i}ho korpusu}},
year = {1996},
}
%%% Evan %%%
% BLEU metric paper; cited in the report as \cite{bleu}.
% The author list is truncated with BibTeX's "and others" (typeset as "et al."
% by the bibliography style) rather than a literal "et. al.", which BibTeX
% would parse as part of a personal name.
@inproceedings{bleu,
  author = {Papineni, K. and others},
  title = "{BLEU: a Method for Automatic Evaluation of Machine Translation}",
  booktitle = {Meeting of the Association for Computational Linguistics},
  year = {2002},
  pages = {311--318}
}
% Perplexity-based tuning of SMT parameters.
% Authors are separated by a bare "and" and each written as "Last, First";
% a comma before "and" would make BibTeX misparse the first name.
@inproceedings{perplexity,
  author = {Nabhan, A. and Rafea, A.},
  title = "{Tuning Statistical Machine Translation Parameters Using Perplexity}",
  booktitle = {IEEE Conference on Information Reuse and Integration},
  year = {2005}
}
% Error measures for automatic error analysis of SMT output.
% The author list is truncated with BibTeX's "and others" (typeset as "et al."
% by the bibliography style) rather than a literal "et. al.", which BibTeX
% would parse as part of a personal name.
@inproceedings{errMeasures,
  author = {Popovic, M. and others},
  title = "{Morpho-syntactic Information for Automatic Error Analysis of Statistical Machine Translation Output}",
  booktitle = {ACL Workshop on Statistical Machine Translation},
  year = {2006},
  pages = {1--6}
}
%%% end Evan %%%
@InProceedings{arukel:04,
author = {Abhishek Arun and Frank Keller},
title = {Lexicalization in crosslinguistic probabilistic parsing: the case of French},
@ -5275,7 +5306,7 @@ strom\r{u} Pra\v{z}sk\'eho z\'avislostn\'{\i}ho korpusu}},
address = {Oxford, UK},
annote = {}
}
@Article{moh:97,
author = {Mehryar Mohri},
title = {Finite-State Transducers in Language and Speech Processing},
@ -5287,11 +5318,9 @@ strom\r{u} Pra\v{z}sk\'eho z\'avislostn\'{\i}ho korpusu}},
}
%article
% An article from a journal or magazine. Required fields: author, title,
% journal, year. Optional fields: volume, number, pages, month, note.
% journal, year. Optional fields: volume, number, pages, month, note.
%article{XXX,
% author = {},
% title = "{}",
@ -5306,7 +5335,7 @@ strom\r{u} Pra\v{z}sk\'eho z\'avislostn\'{\i}ho korpusu}},
%book
% A book with an explicit publisher. Required fields: author or editor,
% title, publisher, year. Optional fields: volume or number, series, address,
% edition, month, note.
% edition, month, note.
%book{XXX,
% author/editor = {},
% title = "{}",
@ -5323,14 +5352,14 @@ strom\r{u} Pra\v{z}sk\'eho z\'avislostn\'{\i}ho korpusu}},
%booklet
% A work that is printed and bound, but without a named publisher or
% sponsoring institution. Required field: title. Optional fields: author,
% howpublished, address, month, year, note.
% howpublished, address, month, year, note.
%conference
% The same as INPROCEEDINGS, included for Scribe compatibility.
% The same as INPROCEEDINGS, included for Scribe compatibility.
%inbook
% A part of a book, which may be a chapter (or section or whatever) and/or a
% range of pages. Required fields: author or editor, title, chapter and/or
% pages, publisher, year. Optional fields: volume or number, series, type,
% address, edition, month, note.
% address, edition, month, note.
%inbook{XXX,
% author/editor = {},
% title = "{}",
@ -5348,7 +5377,7 @@ strom\r{u} Pra\v{z}sk\'eho z\'avislostn\'{\i}ho korpusu}},
%incollection
% A part of a book having its own title. Required fields: author, title,
% booktitle, publisher, year. Optional fields: editor, volume or number,
% series, type, chapter, pages, address, edition, month, note.
% series, type, chapter, pages, address, edition, month, note.
%incollection{XXX,
% author = {},
% title = "{}",
@ -5369,7 +5398,7 @@ strom\r{u} Pra\v{z}sk\'eho z\'avislostn\'{\i}ho korpusu}},
%inproceedings
% An article in a conference proceedings. Required fields: author, title,
% booktitle, year. Optional fields: editor, volume or number, series, pages,
% address, month, organization, publisher, note.
% address, month, organization, publisher, note.
%inProceedings{XXX,
% author = {},
% title = "{}",
@ -5388,16 +5417,16 @@ strom\r{u} Pra\v{z}sk\'eho z\'avislostn\'{\i}ho korpusu}},
%}
%manual
% Technical documentation. Required field: title. Optional fields: author,
% organization, address, edition, month, year, note.
% organization, address, edition, month, year, note.
%mastersthesis
% A Master's thesis. Required fields: author, title, school, year. Optional
% fields: type, address, month, note.
% fields: type, address, month, note.
%misc
% Use this type when nothing else fits. Required fields: none. Optional
% fields: author, title, howpublished, month, year, note.
% fields: author, title, howpublished, month, year, note.
%phdthesis
% A PhD thesis. Required fields: author, title, school, year. Optional
% fields: type, address, month, note.
% fields: type, address, month, note.
%phdthesis{XXX,
% author = {},
% title = "{}",
@ -5411,11 +5440,11 @@ strom\r{u} Pra\v{z}sk\'eho z\'avislostn\'{\i}ho korpusu}},
%proceedings
% The proceedings of a conference. Required fields: title, year. Optional
% fields: editor, volume or number, series, address, month, organization,
% publisher, note.
% publisher, note.
%techreport
% A report published by a school or other institution, usually numbered
% within a series. Required fields: author, title, institution, year.
% Optional fields: type, number, address, month, note.
% Optional fields: type, number, address, month, note.
%techreport{XXX,
% author = {},
% title = "{}",

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

View File

@ -10,6 +10,7 @@
\usepackage{epic,eepic}
\usepackage{boxedminipage}
\usepackage{fancybox}
\usepackage{subfig}
\usepackage[square]{natbib}
\usepackage{ps4pdf}
\usepackage{picins} % pictures next to paragraphs, Ondrej's part
@ -411,7 +412,8 @@ resulted in an extension of the standard text decoder.
\begin{center}
\label{fig:wer-bleu}
\caption{Relationship between BLEU score of target strings and word-error-rate (WER) of source strings.
Source strings are transcriptions of parliamentary speeches produced by a speech recognition system (To be checked) .}\includegraphics[width=10cm]{marcello-bleu-wer}
Source strings are transcriptions of parliamentary speeches produced by a speech recognition system (To be checked) .}
\includegraphics[width=10cm]{marcello-bleu-wer}
\end{center}
\end{figure}
@ -669,7 +671,7 @@ By adding factored translation to conventional phrase based decoding we hope to
Resource consumption is of great importance to researchers as it often determines whether or not experiments can be run or what compromises need to be made. We therefore also benchmarked resource usage against another phrase-based decoder, Pharaoh, as well as other decoders, to ensure that they were comparable in like-for-like decoding.\\
It is essential that features can be easily added, changed or replace, and that the decoder can be used as a ‘toolkit’ in ways not originally envisaged. We followed strict object oriented methodology; all functionality was abstracted into classes which can be more readily changed and extended. For example, we have two implementations of single factor language models which can be used depending on the functionality and licensing terms required. Other implementations for very large and distributed LMs are in the pipeline and can easily be integrated into Moses. The framework also allows for factored LMs; a joint factor and skipping LM are currently available.\\
It is essential that features can be easily added, changed or replaced, and that the decoder can be used as a ‘toolkit’ in ways not originally envisaged. We followed a strict object-oriented methodology; all functionality was abstracted into classes which can be more readily changed and extended. For example, we have two implementations of single-factor language models which can be used depending on the functionality and licensing terms required. Other implementations for very large and distributed LMs are in the pipeline and can easily be integrated into Moses. The framework also allows for factored LMs; a joint factor and skipping LM are currently available.\\
\begin{center}
\begin{figure}[h]
\centering
@ -727,7 +729,7 @@ Its outline is shown below\\
\indent \indent for each hypothesis in stack\\
\indent \indent \indent ProcessOneHypothesis()\\
\end{tt}\\
Each contiguous word coverage (‘span’) of the source sentence is analysed in\\
Each contiguous word coverage (‘span’) of the source sentence is analysed in\\
\indent {\tt CreateTranslationOptions() }\\
\\
and translations are created for that span. Then each hypothesis in each stack is processed in a loop. This loop starts with the stack where nothing has been translated, which has been initialised with one empty hypothesis.
@ -756,9 +758,9 @@ which is out follows\\
\begin{tt}
\indent ProcessInitialTranslation()\\
\indent for every subsequent decoding step\\
\indent \indent if step is ‘Translation’\\
\indent \indent if step is ‘Translation’\\
\indent \indent \indent DecodeStepTranslation::Process()\\
\indent \indent else if step is ‘Generation’\\
\indent \indent else if step is ‘Generation’\\
\indent \indent \indent DecodeStepGeneration::Process()\\
\indent Store translation options for use by decoder\\
\end{tt}
@ -770,7 +772,7 @@ However, each decoding step, whether translation or generation, is a subclass of
so that the correct Process() is selected by polymorphism rather than using if statements as outlined above.
\subsection{Unknown Word Processing}
After translation options have been created for all contiguous spans, some positions may not have any translation options that cover them. In these cases, CreateTranslationOptionsForRange() is called again but the table limits on phrase and generation tables are ignored. \\
If this still fails to cover the position, then a new target word is create by copying the string for each factor from the untranslatable source word, or the string ‘UNK’ if the source factor is null.\\
If this still fails to cover the position, then a new target word is created by copying the string for each factor from the untranslatable source word, or the string ‘UNK’ if the source factor is null.\\
\begin{center}
\begin{tabular}{|c|c|c|}
\hline
@ -994,7 +996,7 @@ If new regression tests have new data dependencies, the test data
will need to be updated. For more information on this workflow,
refer to the previous section.
\subsection{Accessability}
\subsection{Accessibility}
The source code for the Moses project is housed at Sourceforge.net
in a subversion repository. The URL for the project is:
@ -1248,7 +1250,7 @@ We describe some statistics generally used to measure error and present two erro
\subsection{Error Measurement}
There are three common measures of translation error. BiLingual Evaluation Understudy (BLEU) (\cite{bleu}), the most common, measures matches of short phrases between the translated and reference text as well as the difference in the lengths of the reference and output. BLEU can be applied to multiple references, but in a way such that BLEU scores using different numbers of references are not comparable.
Word Error Rate (WER) measures the number of matching output and reference words given that if output word $i$ is noted as matching reference word $j$, output word $i + 1$ cannot match any reference word before $j$; i.e., word ordering is preserved in both texts. Such a mapping isn't unique, so WER is specified using the maximum attainable number of single-word matches. This number is computable by some simple dynamic programming. [[[Ought I to elaborate here?]]]
Word Error Rate (WER) measures the number of matching output and reference words given that if output word $i$ is noted as matching reference word $j$, output word $i + 1$ cannot match any reference word before $j$; i.e., word ordering is preserved in both texts. Such a mapping isn't unique, so WER is specified using the maximum attainable number of single-word matches. This number is computable by some simple dynamic programming.
Position-Independent Word Error Rate (PWER) simply counts matching output and reference words regardless of their order in the text. This allows for rearrangement of logical units of text, but allows a system to get away with poor rearrangement of function words.
@ -1269,8 +1271,8 @@ The overall view for a corpus shows a list of files associated with a given corp
\centering
\caption{Sample output of corpus-statistics tool.}
\label{fig:sentence_by_sentence_screenshot}
%\subfloat[detailed view of sentences]{\frame{\vspace{.05in}\hspace{.05in}\includegraphics[width=6in]{}\hspace{.05in}\vspace{.05in}}} \newline
%\subfloat[overall corpus view]{\frame{\vspace{.05in}\hspace{.05in}\includegraphics[width=6in]{}\hspace{.05in}\vspace{.05in}}}
\subfloat[detailed view of sentences]{\frame{\vspace{.05in}\hspace{.05in}\includegraphics[width=6in]{images/sentence-by-sentence_multiref_screenshot.png}\hspace{.05in}\vspace{.05in}}} \newline
\subfloat[overall corpus view]{\frame{\vspace{.05in}\hspace{.05in}\includegraphics[width=6in]{images/corpus_overview_screenshot_de-en.png}\hspace{.05in}\vspace{.05in}}}
\end{figure}
A second tool developed during the workshop shows the mapping of individual source to output phrases (boxes of the same color on the two lines in figure \ref{fig:phrases_used_screenshot}) and gives the average source phrase length used. This statistic tells us how much use is being made of the translation model's capabilities. There's no need to take the time to tabulate all phrases of length 10, say, in the training source text if we're pretty sure that at translation time no source phrase longer than 4 words will be chosen.
@ -1279,7 +1281,7 @@ A second tool developed during the workshop shows the mapping of individual sour
\centering
\caption{Sample output of phrase-detail tool.}
\label{fig:phrases_used_screenshot}
%\subfloat[]{\frame{\vspace{.05in}\hspace{.05in}\includegraphics[width=5in]{}\hspace{.05in}\vspace{.05in}}}
\frame{\vspace{.05in}\hspace{.05in}\includegraphics[width=5in]{images/show-phrases-used_crossover_screenshot.png}\hspace{.05in}\vspace{.05in}}
\end{figure}
%{\sc Evan Herbst}
@ -2421,8 +2423,8 @@ would be necessary to describe the linguistic reality more adequately.
%<f>\v{r}ekl<MDl src="a">\v{r}\'{\i}ci<MDt src="a">VpYS---XR-AA---<A>Pred<r>9<g>0
%<f>mluv\v{c}\'{\i}<MDl src="a">mluv\v{c}\'{\i}<MDt src="a">NNMS1-----A----<A>Sb<r>10<g>9
%poptávka trvale stoupá za podpory prospotřebitelské vládní politiky , řekl mluvčí asociace .
%MEMBER:subst+1 --- HEAD:VB:stoupat_:T MEMBER:Rza-1+2 --- --- --- --- MEMBER:Z- HEAD:Vp:říci MEMBER:subst+1 --- PUNCT:.
%poptávka trvale stoupá za podpory prospotřebitelské vládní politiky , řekl mluvčí asociace .
%MEMBER:subst+1 --- HEAD:VB:stoupat_:T MEMBER:Rza-1+2 --- --- --- --- MEMBER:Z- HEAD:Vp:říci MEMBER:subst+1 --- PUNCT:.
%<f>\broken{Popt\'{a}vka\\MEMBER:subst+1}<A>Sb<r>1<g>3
%<f>\broken{trvale\\---}<A>Adv<r>2<g>3
@ -2432,7 +2434,7 @@ would be necessary to describe the linguistic reality more adequately.
%<f>\broken{vl\'{a}dn\'{\i}\\---}<A>Atr<r>6<g>7
%<f>\broken{politiky\\---}<A>Atr<r>7<g>5
%<f>\broken{,\\MEMBER:Z-}<A>AuxX<r>8<g>3
%<f>\broken{\v{r}ekl\\HEAD:Vp:říci}<A>Pred<r>9<g>0
%<f>\broken{\v{r}ekl\\HEAD:Vp:říci}<A>Pred<r>9<g>0
%<f>\broken{mluv\v{c}\'{\i}\\MEMBER:subst+1}<A>Sb<r>10<g>9
%<f>\broken{.\\PUNCT:.}<A>AuxK<r>11<g>0