├── .gitattributes
├── .gitignore
├── FastRDFStore.sln
├── FastRDFStore
├── App.config
├── FastRDFStore.cs
├── FastRDFStore.csproj
├── FastRDFStore.xproj
├── FreebaseCommonTypes.cs
├── IFastRDFStore.cs
├── Logger.cs
├── Program.cs
├── Properties
│ └── AssemblyInfo.cs
├── deploy-debug.bat
├── deploy.bat
└── packages.config
├── FastRDFStoreClient
├── App.config
├── FastRDFStoreClient.csproj
├── Program.cs
├── Properties
│ └── AssemblyInfo.cs
└── packages.config
├── FreebaseToRDFStore
├── FreebaseToRDFStore.csproj
├── Program.cs
├── Properties
│ └── AssemblyInfo.cs
├── app.config
└── packages.config
├── LICENSE.docx
├── LICENSE.md
├── README.md
├── SECURITY.md
├── SharedDataTypes
├── Config.cs
└── FreebaseCommonTypes.cs
├── bin
├── Acknowledgment.md
├── CommandLine.dll
├── FastRDFStore.exe
├── FastRDFStoreClient.exe
├── FreebaseToRDFStore.exe
└── README.md
└── data
└── README.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # User-specific files
5 | *.suo
6 | *.user
7 | *.userosscache
8 | *.sln.docstates
9 |
10 | # User-specific files (MonoDevelop/Xamarin Studio)
11 | *.userprefs
12 |
13 | # Build results
14 | [Dd]ebug/
15 | [Dd]ebugPublic/
16 | [Rr]elease/
17 | [Rr]eleases/
18 | x64/
19 | x86/
20 | bld/
21 | [Bb]in/
22 | [Oo]bj/
23 | [Ll]og/
24 |
25 | # Visual Studio 2015 cache/options directory
26 | .vs/
27 | # Uncomment if you have tasks that create the project's static files in wwwroot
28 | #wwwroot/
29 |
30 | # MSTest test Results
31 | [Tt]est[Rr]esult*/
32 | [Bb]uild[Ll]og.*
33 |
34 | # NUNIT
35 | *.VisualState.xml
36 | TestResult.xml
37 |
38 | # Build Results of an ATL Project
39 | [Dd]ebugPS/
40 | [Rr]eleasePS/
41 | dlldata.c
42 |
43 | # DNX
44 | project.lock.json
45 | artifacts/
46 |
47 | *_i.c
48 | *_p.c
49 | *_i.h
50 | *.ilk
51 | *.meta
52 | *.obj
53 | *.pch
54 | *.pdb
55 | *.pgc
56 | *.pgd
57 | *.rsp
58 | *.sbr
59 | *.tlb
60 | *.tli
61 | *.tlh
62 | *.tmp
63 | *.tmp_proj
64 | *.log
65 | *.vspscc
66 | *.vssscc
67 | .builds
68 | *.pidb
69 | *.svclog
70 | *.scc
71 |
72 | # Chutzpah Test files
73 | _Chutzpah*
74 |
75 | # Visual C++ cache files
76 | ipch/
77 | *.aps
78 | *.ncb
79 | *.opendb
80 | *.opensdf
81 | *.sdf
82 | *.cachefile
83 | *.VC.db
84 | *.VC.VC.opendb
85 |
86 | # Visual Studio profiler
87 | *.psess
88 | *.vsp
89 | *.vspx
90 | *.sap
91 |
92 | # TFS 2012 Local Workspace
93 | $tf/
94 |
95 | # Guidance Automation Toolkit
96 | *.gpState
97 |
98 | # ReSharper is a .NET coding add-in
99 | _ReSharper*/
100 | *.[Rr]e[Ss]harper
101 | *.DotSettings.user
102 |
103 | # JustCode is a .NET coding add-in
104 | .JustCode
105 |
106 | # TeamCity is a build add-in
107 | _TeamCity*
108 |
109 | # DotCover is a Code Coverage Tool
110 | *.dotCover
111 |
112 | # NCrunch
113 | _NCrunch_*
114 | .*crunch*.local.xml
115 | nCrunchTemp_*
116 |
117 | # MightyMoose
118 | *.mm.*
119 | AutoTest.Net/
120 |
121 | # Web workbench (sass)
122 | .sass-cache/
123 |
124 | # Installshield output folder
125 | [Ee]xpress/
126 |
127 | # DocProject is a documentation generator add-in
128 | DocProject/buildhelp/
129 | DocProject/Help/*.HxT
130 | DocProject/Help/*.HxC
131 | DocProject/Help/*.hhc
132 | DocProject/Help/*.hhk
133 | DocProject/Help/*.hhp
134 | DocProject/Help/Html2
135 | DocProject/Help/html
136 |
137 | # Click-Once directory
138 | publish/
139 |
140 | # Publish Web Output
141 | *.[Pp]ublish.xml
142 | *.azurePubxml
143 | # TODO: Comment the next line if you want to checkin your web deploy settings
144 | # but database connection strings (with potential passwords) will be unencrypted
145 | *.pubxml
146 | *.publishproj
147 |
148 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
149 | # checkin your Azure Web App publish settings, but sensitive information contained
150 | # in these scripts will be unencrypted
151 | PublishScripts/
152 |
153 | # NuGet Packages
154 | *.nupkg
155 | # The packages folder can be ignored because of Package Restore
156 | **/packages/*
157 | # except build/, which is used as an MSBuild target.
158 | !**/packages/build/
159 | # Uncomment if necessary however generally it will be regenerated when needed
160 | #!**/packages/repositories.config
161 | # NuGet v3's project.json files produce more ignorable files
162 | *.nuget.props
163 | *.nuget.targets
164 |
165 | # Microsoft Azure Build Output
166 | csx/
167 | *.build.csdef
168 |
169 | # Microsoft Azure Emulator
170 | ecf/
171 | rcf/
172 |
173 | # Windows Store app package directories and files
174 | AppPackages/
175 | BundleArtifacts/
176 | Package.StoreAssociation.xml
177 | _pkginfo.txt
178 |
179 | # Visual Studio cache files
180 | # files ending in .cache can be ignored
181 | *.[Cc]ache
182 | # but keep track of directories ending in .cache
183 | !*.[Cc]ache/
184 |
185 | # Others
186 | ClientBin/
187 | ~$*
188 | *~
189 | *.dbmdl
190 | *.dbproj.schemaview
191 | *.pfx
192 | *.publishsettings
193 | node_modules/
194 | orleans.codegen.cs
195 |
196 | # Since there are multiple workflows, uncomment next line to ignore bower_components
197 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
198 | #bower_components/
199 |
200 | # RIA/Silverlight projects
201 | Generated_Code/
202 |
203 | # Backup & report files from converting an old project file
204 | # to a newer Visual Studio version. Backup files are not needed,
205 | # because we have git ;-)
206 | _UpgradeReport_Files/
207 | Backup*/
208 | UpgradeLog*.XML
209 | UpgradeLog*.htm
210 |
211 | # SQL Server files
212 | *.mdf
213 | *.ldf
214 |
215 | # Business Intelligence projects
216 | *.rdl.data
217 | *.bim.layout
218 | *.bim_*.settings
219 |
220 | # Microsoft Fakes
221 | FakesAssemblies/
222 |
223 | # GhostDoc plugin setting file
224 | *.GhostDoc.xml
225 |
226 | # Node.js Tools for Visual Studio
227 | .ntvs_analysis.dat
228 |
229 | # Visual Studio 6 build log
230 | *.plg
231 |
232 | # Visual Studio 6 workspace options file
233 | *.opt
234 |
235 | # Visual Studio LightSwitch build output
236 | **/*.HTMLClient/GeneratedArtifacts
237 | **/*.DesktopClient/GeneratedArtifacts
238 | **/*.DesktopClient/ModelManifest.xml
239 | **/*.Server/GeneratedArtifacts
240 | **/*.Server/ModelManifest.xml
241 | _Pvt_Extensions
242 |
243 | # Paket dependency manager
244 | .paket/paket.exe
245 | paket-files/
246 |
247 | # FAKE - F# Make
248 | .fake/
249 |
250 | # JetBrains Rider
251 | .idea/
252 | *.sln.iml
253 |
254 | # =========================
255 | # Operating System Files
256 | # =========================
257 |
258 | # OSX
259 | # =========================
260 |
261 | .DS_Store
262 | .AppleDouble
263 | .LSOverride
264 |
265 | # Thumbnails
266 | ._*
267 |
268 | # Files that might appear in the root of a volume
269 | .DocumentRevisions-V100
270 | .fseventsd
271 | .Spotlight-V100
272 | .TemporaryItems
273 | .Trashes
274 | .VolumeIcon.icns
275 |
276 | # Directories potentially created on remote AFP share
277 | .AppleDB
278 | .AppleDesktop
279 | Network Trash Folder
280 | Temporary Items
281 | .apdisk
282 |
283 | # Windows
284 | # =========================
285 |
286 | # Windows image file caches
287 | Thumbs.db
288 | ehthumbs.db
289 |
290 | # Folder config file
291 | Desktop.ini
292 |
293 | # Recycle Bin used on file shares
294 | $RECYCLE.BIN/
295 |
296 | # Windows Installer files
297 | *.cab
298 | *.msi
299 | *.msm
300 | *.msp
301 |
302 | # Windows shortcuts
303 | *.lnk
304 |
--------------------------------------------------------------------------------
/FastRDFStore.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 14
4 | VisualStudioVersion = 14.0.25420.1
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{B4259ED8-B959-4F01-AF76-983D3C30811C}"
7 | ProjectSection(SolutionItems) = preProject
8 | SharedDataTypes\FreebaseCommonTypes.cs = SharedDataTypes\FreebaseCommonTypes.cs
9 | EndProjectSection
10 | EndProject
11 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FreebaseToRDFStore", "FreebaseToRDFStore\FreebaseToRDFStore.csproj", "{DFE0B2DA-CDB9-4B5C-8472-E04AD2E9C9EC}"
12 | EndProject
13 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FastRDFStoreClient", "FastRDFStoreClient\FastRDFStoreClient.csproj", "{06F0469F-5ACC-4937-89AF-CD5436132C02}"
14 | EndProject
15 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "FastRDFStore", "FastRDFStore\FastRDFStore.csproj", "{9E64D23C-9C19-49DA-9903-12F151140AE7}"
16 | EndProject
17 | Global
18 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
19 | Debug|Any CPU = Debug|Any CPU
20 | Debug|x64 = Debug|x64
21 | Release|Any CPU = Release|Any CPU
22 | Release|x64 = Release|x64
23 | EndGlobalSection
24 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
25 | {DFE0B2DA-CDB9-4B5C-8472-E04AD2E9C9EC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
26 | {DFE0B2DA-CDB9-4B5C-8472-E04AD2E9C9EC}.Debug|Any CPU.Build.0 = Debug|Any CPU
27 | {DFE0B2DA-CDB9-4B5C-8472-E04AD2E9C9EC}.Debug|x64.ActiveCfg = Debug|x64
28 | {DFE0B2DA-CDB9-4B5C-8472-E04AD2E9C9EC}.Debug|x64.Build.0 = Debug|x64
29 | {DFE0B2DA-CDB9-4B5C-8472-E04AD2E9C9EC}.Release|Any CPU.ActiveCfg = Release|Any CPU
30 | {DFE0B2DA-CDB9-4B5C-8472-E04AD2E9C9EC}.Release|Any CPU.Build.0 = Release|Any CPU
31 | {DFE0B2DA-CDB9-4B5C-8472-E04AD2E9C9EC}.Release|x64.ActiveCfg = Release|x64
32 | {DFE0B2DA-CDB9-4B5C-8472-E04AD2E9C9EC}.Release|x64.Build.0 = Release|x64
33 | {06F0469F-5ACC-4937-89AF-CD5436132C02}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
34 | {06F0469F-5ACC-4937-89AF-CD5436132C02}.Debug|Any CPU.Build.0 = Debug|Any CPU
35 | {06F0469F-5ACC-4937-89AF-CD5436132C02}.Debug|x64.ActiveCfg = Debug|x64
36 | {06F0469F-5ACC-4937-89AF-CD5436132C02}.Debug|x64.Build.0 = Debug|x64
37 | {06F0469F-5ACC-4937-89AF-CD5436132C02}.Release|Any CPU.ActiveCfg = Release|Any CPU
38 | {06F0469F-5ACC-4937-89AF-CD5436132C02}.Release|Any CPU.Build.0 = Release|Any CPU
39 | {06F0469F-5ACC-4937-89AF-CD5436132C02}.Release|x64.ActiveCfg = Release|Any CPU
40 | {9E64D23C-9C19-49DA-9903-12F151140AE7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
41 | {9E64D23C-9C19-49DA-9903-12F151140AE7}.Debug|Any CPU.Build.0 = Debug|Any CPU
42 | {9E64D23C-9C19-49DA-9903-12F151140AE7}.Debug|x64.ActiveCfg = Debug|x64
43 | {9E64D23C-9C19-49DA-9903-12F151140AE7}.Debug|x64.Build.0 = Debug|x64
44 | {9E64D23C-9C19-49DA-9903-12F151140AE7}.Release|Any CPU.ActiveCfg = Release|Any CPU
45 | {9E64D23C-9C19-49DA-9903-12F151140AE7}.Release|Any CPU.Build.0 = Release|Any CPU
46 | {9E64D23C-9C19-49DA-9903-12F151140AE7}.Release|x64.ActiveCfg = Release|x64
47 | {9E64D23C-9C19-49DA-9903-12F151140AE7}.Release|x64.Build.0 = Release|x64
48 | EndGlobalSection
49 | GlobalSection(SolutionProperties) = preSolution
50 | HideSolutionNode = FALSE
51 | EndGlobalSection
52 | EndGlobal
53 |
--------------------------------------------------------------------------------
/FastRDFStore/App.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/FastRDFStore/FastRDFStore.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.IO.Compression;
5 | using System.Linq;
6 | using System.Text;
7 |
8 | namespace FastRDFStore
9 | {
10 | public class FastRDFStore : IFastRDFStore
11 | {
// Size of one in-memory page of datapages.bin; blob offsets are split into
// (page index, offset within page) using this value.
private const int pageSize = 1024*1024*1024; // 1GB

// Guards the one-time index load performed by Initialize.
private static readonly Object datalock = new Object();
// True once LoadIndex has completed. Volatile because Initialize reads it outside the lock.
private static volatile bool initialized;
// partition key (see GetSubjectKey) -> subject MID -> (offset into the data pages, compressed byte count)
private static Dictionary<string, Dictionary<string, Tuple<long, int>>> midToCompressedBlobLocation;
// Contents of datapages.bin, read in pageSize chunks.
private static List<byte[]> datapages;
// subject MID -> (offset of the first chunk, compressed byte count of each consecutive chunk)
private static Dictionary<string, Tuple<long, List<int>>> largeMidsToCompressedBlobsLocations;
// partition key -> MID -> flag as stored in cvtnodes.bin
private static Dictionary<string, Dictionary<string, bool>> cvtNodes;
// MID -> entity name
private static Dictionary<string, string> namesTable;
// predicate -> the object node type accepted for that predicate
private static Dictionary<string, FBNodeType> predObjTypeTable;
// MIDs listed in ghost_mid.txt; objects in this set are dropped from results.
private static HashSet<string> setGhostMid;
private static Logger logger;

// Directory that holds the index and data files read by LoadIndex.
private static string datadir;
26 |
/// <summary>
/// Records the data directory, creates the logger and loads the index files once per process.
/// </summary>
/// <param name="datadirParam">Directory containing the index and data files.</param>
/// <param name="logFilename">File name handed to the <c>Logger</c> constructor.</param>
/// <remarks>
/// Any exception raised while loading is logged and then rethrown.
/// NOTE(review): datadir and logger are reassigned on every call, even when the index
/// is already loaded - confirm that repeated calls with different arguments are intended.
/// </remarks>
public static void Initialize(string datadirParam, string logFilename)
{
    datadir = datadirParam;
    logger = new Logger(logFilename);
    logger.Log("Initializing FastRDFStore");

    try
    {
        // Cheap unlocked check; the lock is only taken while the index still needs loading.
        if (!initialized)
            LoadIndexOnce();
    }
    catch (Exception e)
    {
        // Record the failure, then let it propagate so the caller sees that start-up failed.
        logger.LogException("Exception when initializing FastRDFStore", e);
        throw;
    }

    logger.Log("Done initializing");
}

// Loads the index under the lock unless another thread finished doing so while we waited.
private static void LoadIndexOnce()
{
    lock (datalock)
    {
        if (initialized)
            return;

        LoadIndex();
        initialized = true;
    }
}
58 |
/// <summary>
/// Lists the predicates leaving <paramref name="subjectMid"/>: direct predicates whose object
/// is not a CVT node, plus two-hop predicates through CVT nodes, formatted as
/// "predicate cvtPredicate" (separated by a single space).
/// </summary>
/// <param name="subjectMid">MID of the subject; null or shorter than two characters yields an empty result.</param>
/// <returns>
/// The distinct predicates in sorted order, or an empty array when the store is not
/// initialized, the MID is rejected, or an exception occurs (the exception is logged).
/// </returns>
public string[] GetOutboundPredicates(string subjectMid)
{
    try
    {
        logger.Log("GetOutboundPredicates called for " + subjectMid);
        if (!initialized || subjectMid == null || subjectMid.Length < 2) return new string[] {};

        var results = GetPredicateObjectPairsForSubject(subjectMid);

        // Direct edges to non-CVT objects, minus bookkeeping predicates.
        // Materialized once; the original re-ran this parallel query on every use.
        var adjoiningPredicates = results.AsParallel().Where(e => !e.Item1.StartsWith("wikipedia.") &&
                                                                  e.Item1 != "type.object.type" &&
                                                                  e.Item1 != "type.object.key" &&
                                                                  e.Item1 != "type.object.name" &&
                                                                  e.Item1 != "type.object.permission" &&
                                                                  e.Item1 != "common.topic.alias" &&
                                                                  e.Item1 != "common.topic.description" &&
                                                                  e.Item1 != "common.topic.image" &&
                                                                  !IsCVT(e.Item2)).Select(e => e.Item1).ToList();
        var adjPredCnt = adjoiningPredicates.GroupBy(x => x).ToDictionary(g => g.Key, g => g.Count());

        // Edges whose object is a CVT node; these are expanded one more hop below.
        var cvtHopResults = results.AsParallel().Where(e => IsCVT(e.Item2) &&
                                                            !e.Item1.StartsWith("wikipedia.") &&
                                                            e.Item1 != "common.topic.article" &&
                                                            e.Item1 != "common.topic.webpage" &&
                                                            e.Item1 != "common.topic.description" &&
                                                            e.Item1 != "common.document.text" &&
                                                            e.Item1 != "common.topic.image").ToList();
        var cvtPredCnt = cvtHopResults.GroupBy(x => x.Item1).ToDictionary(g => g.Key, g => g.Count());

        // A predicate seen with both CVT and non-CVT objects is kept only on the majority side.
        // The key list is copied first because both dictionaries are modified inside the loop.
        var conflictPreds = adjPredCnt.Keys.Intersect(cvtPredCnt.Keys).ToList();
        foreach (var cPred in conflictPreds)
        {
            int adjCount = adjPredCnt[cPred];
            int cvtCount = cvtPredCnt[cPred];
            if (adjCount > cvtCount)
                cvtPredCnt.Remove(cPred);
            else if (adjCount < cvtCount)
                adjPredCnt.Remove(cPred);
            else
                // Only a tie is unresolved; previously this was logged for every conflict.
                logger.Log("Cannot resolve adj-cvt predicate conflict: " + subjectMid + " " + cPred);
        }

        // For every surviving CVT edge, pair the edge predicate with each predicate of the CVT node.
        var cvtHopPredicates = cvtHopResults.AsParallel().Where(e => cvtPredCnt.ContainsKey(e.Item1))
            .SelectMany(e => GetPredicateObjectPairsForSubject(e.Item2).Select(pair => pair.Item1)
                .Where(predicate2 => predicate2 != "type.object.type" &&
                                     predicate2 != "type.object.key" &&
                                     predicate2 != "common.topic.description" &&
                                     predicate2 != "common.document.text").Distinct()
                .Select(predicate2 => e.Item1 + " " + predicate2));

        var allPredicates = adjoiningPredicates.AsParallel()
            .Where(x => adjPredCnt.ContainsKey(x))
            .Union(cvtHopPredicates)
            .OrderBy(e => e);

        return allPredicates.ToArray();
    }
    catch (Exception e)
    {
        logger.LogException("GetOutboundPredicates failed", e);
        return new string[] { };
    }
}
123 |
/// <summary>
/// Looks up the name of each MID in the names table.
/// </summary>
/// <param name="entMids">MIDs to resolve.</param>
/// <returns>
/// One entry per input MID, in the same order; "" for a MID that is null or has no name.
/// An empty array is returned if the lookup fails as a whole (the exception is logged).
/// </returns>
public string[] GetEntityNames(string[] entMids)
{
    try
    {
        return entMids.Select(mid =>
        {
            string name;
            // A null MID would make the dictionary lookup throw and discard the whole batch,
            // so it is mapped to "" like any other unknown MID.
            return (mid != null && namesTable.TryGetValue(mid, out name)) ? name : "";
        }).ToArray();
    }
    catch (Exception e)
    {
        logger.LogException("GetEntityNames failed", e);
        return new string[] { };
    }
}
137 |
/// <summary>
/// Public entry point: builds the object graph around <paramref name="subjectMid"/> and
/// returns only its root, without exposing the dictionary of nodes created along the way.
/// </summary>
/// <returns>The root node, or null if building the graph threw (the exception is logged).</returns>
public SimpleFBObject GetSimpleObjectPredicatesAndCVTs(string subjectMid, int maxPerPredicate = int.MaxValue, bool followCVT = true)
{
    SimpleFBObject root = null;
    try
    {
        logger.Log("GetSimpleObjectPredicatesAndCVTs called for "+subjectMid);
        Dictionary<string, FBObject> unusedNodes;
        root = GetSimpleObjectPredicatesAndCVTs(subjectMid, out unusedNodes, maxPerPredicate, followCVT);
    }
    catch (Exception e)
    {
        logger.LogException("GetSimpleObjectPredicatesAndCVTs failed", e);
    }
    return root;
}
153 |
/// <summary>
/// Builds the root node for <paramref name="subjectMid"/> and fills in its predicates and objects.
/// </summary>
/// <param name="subjectMid">MID of the root subject.</param>
/// <param name="nodesInGraph">Receives every node created for this graph, keyed by MID (the root included).</param>
/// <param name="maxPerPredicate">Maximum number of objects kept per predicate.</param>
/// <param name="followCVT">Whether CVT objects of the root are expanded.</param>
/// <returns>The root node.</returns>
private SimpleFBObject GetSimpleObjectPredicatesAndCVTs(string subjectMid,
    out Dictionary<string, FBObject> nodesInGraph, int maxPerPredicate = int.MaxValue, bool followCVT = true)
{
    var root = new SimpleFBObject
    {
        Mid = subjectMid,
        Name = GetName(subjectMid)
    };

    // Register the root before expanding so objects that point back at it reuse this instance.
    var knownNodes = new Dictionary<string, FBObject>();
    knownNodes[subjectMid] = root;

    root.Objects = GetPredicatesAndNamedObjectsIncludingCVTs(knownNodes, subjectMid, maxPerPredicate, followCVT);

    nodesInGraph = knownNodes;
    return root;
}
167 |
/// <summary>
/// Reads the (predicate, object) pairs stored for <paramref name="subjectMid"/> and turns them
/// into graph nodes, grouped by predicate.
/// </summary>
/// <param name="existingNodes">
/// Nodes already created for this graph, keyed by MID. Entity and CVT nodes created here are
/// added to it, so a MID that occurs several times is represented by a single instance.
/// </param>
/// <param name="subjectMid">MID whose outgoing pairs are read.</param>
/// <param name="maxPerPredicate">Maximum number of pairs accepted per predicate.</param>
/// <param name="followCVT">
/// When true, CVT objects are added and expanded one level (CVT nodes below them are not
/// followed). When false, CVT objects are not added to the result.
/// </param>
/// <returns>Predicates and objects hanging off of subjectMid. We guarantee that each predicate appears only once in the array</returns>
private PredicateAndObjects[] GetPredicatesAndNamedObjectsIncludingCVTs(
    Dictionary<string, FBObject> existingNodes, string subjectMid, int maxPerPredicate = int.MaxValue,
    bool followCVT = true)
{
    var objectsByPredicate = new Dictionary<string, List<FBObject>>();
    var acceptedPerPredicate = new Dictionary<string, int>();

    foreach (Tuple<string, string> edge in GetPredicateObjectPairsForSubject(subjectMid))
    {
        string predicate = edge.Item1;
        string obj = edge.Item2;

        // Classify the object: CVT wins over entity, anything else is a plain value.
        FBNodeType objType;
        if (IsCVT(obj))
            objType = FBNodeType.CVT;
        else if (IsEntity(obj))
            objType = FBNodeType.Entity;
        else
            objType = FBNodeType.Value;

        // Drop the pair when the predicate has a recorded object type and this object disagrees.
        FBNodeType expectedType;
        if (predObjTypeTable.TryGetValue(predicate, out expectedType) && objType != expectedType)
            continue;

        // Drop objects listed in the pre-compiled ghost MID table.
        if (objType != FBNodeType.Value && setGhostMid.Contains(obj))
            continue;

        // An entity that is not a CVT node and has no name is also treated as a ghost MID.
        // This can happen because tuples with excluded predicates are not indexed.
        if (objType == FBNodeType.Entity && string.IsNullOrEmpty(GetName(obj)))
            continue;

        // Enforce the per-predicate cap (a missing counter reads as 0).
        int acceptedSoFar;
        acceptedPerPredicate.TryGetValue(predicate, out acceptedSoFar);
        if (acceptedSoFar >= maxPerPredicate)
            continue;
        acceptedPerPredicate[predicate] = acceptedSoFar + 1;

        // Find or create the bucket for this predicate.
        List<FBObject> bucket;
        if (!objectsByPredicate.TryGetValue(predicate, out bucket))
        {
            bucket = new List<FBObject>();
            objectsByPredicate[predicate] = bucket;
        }

        switch (objType)
        {
            case FBNodeType.Entity:
            {
                FBObject node;
                if (!existingNodes.TryGetValue(obj, out node))
                {
                    var entityNode = new SimpleFBObject();
                    entityNode.Mid = obj;
                    entityNode.Name = GetName(obj);
                    existingNodes[obj] = entityNode;
                    node = entityNode;
                }
                bucket.Add(node);
                break;
            }
            case FBNodeType.Value:
            {
                var valueNode = new ValueFBObject();
                valueNode.Value = obj;
                bucket.Add(valueNode);
                break;
            }
            default: // FBNodeType.CVT
            {
                if (!followCVT)
                    break;

                FBObject node;
                if (!existingNodes.TryGetValue(obj, out node))
                {
                    var cvtNode = new CVTFBObject();
                    cvtNode.Mid = obj;
                    // Expand the CVT node itself, but do not follow CVT nodes from this CVT node.
                    cvtNode.Objects = GetPredicatesAndNamedObjectsIncludingCVTs(existingNodes, obj,
                        maxPerPredicate, false);
                    existingNodes[obj] = cvtNode;
                    node = cvtNode;
                }
                bucket.Add(node);
                break;
            }
        }
    }

    // Convert to the return type (arrays instead of lists and dictionaries).
    var converted = new List<PredicateAndObjects>(objectsByPredicate.Count);
    foreach (KeyValuePair<string, List<FBObject>> entry in objectsByPredicate)
        converted.Add(new PredicateAndObjects() {Predicate = entry.Key, Objects = entry.Value.ToArray()});
    return converted.ToArray();
}
259 |
/// <summary>
/// Reduces <paramref name="node"/>'s predicate list to the single entry whose predicate equals
/// <paramref name="predicate"/>, or to an empty array when there is no such entry.
/// </summary>
/// <param name="node">Node whose <c>Objects</c> array is replaced in place.</param>
/// <param name="predicate">Predicate to keep.</param>
private void FilterToSinglePredicate(SimpleFBObject node, string predicate)
{
    PredicateAndObjects match = null;
    foreach (PredicateAndObjects candidate in node.Objects)
    {
        if (candidate.Predicate == predicate)
        {
            match = candidate;
            break;
        }
    }

    // The original fell through after the not-found case and always stored a one-element
    // array, which then contained null; callers test Objects.Length == 0 to detect a miss.
    node.Objects = match == null
        ? new PredicateAndObjects[0]
        : new PredicateAndObjects[] {match};
}
278 |
279 |
/// <summary>
/// Builds the graph around <paramref name="subjectMid"/>, keeps only the requested predicate and
/// expands the answer nodes reached through it.
/// </summary>
/// <param name="subjectMid">MID of the subject.</param>
/// <param name="predicate">
/// Either a single predicate, or "predicate cvtPredicate" (separated by one space) to go
/// through a CVT node.
/// </param>
/// <returns>
/// The subject node restricted to the requested predicate; its <c>Objects</c> array is empty
/// when the subject does not have that predicate. Null when <paramref name="predicate"/> is
/// null or has more than two parts, or when an exception occurs (the exception is logged).
/// </returns>
public SimpleFBObject GetSimpleObjectFilteredPredicateAndObjects(string subjectMid, string predicate)
{
    try
    {
        logger.Log("GetSimpleObjectFilteredPredicateAndObjects called for subj=" + subjectMid + ", pred="+predicate);

        // Validate the request before paying for the graph lookup.
        if (predicate == null) return null;
        string[] predicateParts = predicate.Split(' ');
        if (predicateParts.Length > 2) return null;

        Dictionary<string, FBObject> nodesInGraph;
        SimpleFBObject initial = GetSimpleObjectPredicatesAndCVTs(subjectMid, out nodesInGraph, int.MaxValue, true);

        FilterToSinglePredicate(initial, predicateParts[0]);
        // A missing predicate may surface as an empty array or as a single null entry;
        // normalize both to an empty array instead of dereferencing null below.
        if (initial.Objects.Length == 0 || initial.Objects[0] == null)
        {
            initial.Objects = new PredicateAndObjects[0];
            return initial; // Doesn't contain the desired predicate
        }
        PredicateAndObjects predicateAndObjects = initial.Objects[0];

        if (predicateParts.Length == 2)
        {
            // Two-part predicate: go through each CVT node and expand the answers found
            // under the second predicate.
            foreach (CVTFBObject cvt in predicateAndObjects.Objects.OfType<CVTFBObject>())
            {
                foreach (PredicateAndObjects inner in cvt.Objects)
                {
                    if (inner.Predicate == predicateParts[1])
                        ExpandUnexpandedAnswers(inner.Objects, nodesInGraph);
                }
            }
        }
        else
        {
            ExpandUnexpandedAnswers(predicateAndObjects.Objects, nodesInGraph);
        }
        return initial;
    }
    catch (Exception e)
    {
        logger.LogException("GetSimpleObjectFilteredPredicateAndObjects failed", e);
        return null;
    }
}

// Fills in Objects for every SimpleFBObject in answers that has not been expanded yet (Objects == null).
private void ExpandUnexpandedAnswers(FBObject[] answers, Dictionary<string, FBObject> nodesInGraph)
{
    foreach (SimpleFBObject answer in answers.OfType<SimpleFBObject>())
    {
        if (answer.Objects == null)
        {
            answer.Objects = GetPredicatesAndNamedObjectsIncludingCVTs(nodesInGraph, answer.Mid,
                int.MaxValue, false);
        }
    }
}
355 |
/// <summary>
/// Follows a chain of predicates starting at <paramref name="startMid"/> and returns every
/// sequence of nodes that matches the whole chain.
/// </summary>
/// <param name="startMid">MID the chain starts from (not included in the returned sequences).</param>
/// <param name="chainPredicates">Predicates to follow, in order.</param>
/// <returns>
/// One array per matching path, holding the node reached after each predicate. Null when the
/// chain is null or empty, when no path matches, or when an exception occurs (the exception is logged).
/// </returns>
public string[][] FindNodeSquencesOnPredicateChain(string startMid, string[] chainPredicates)
{
    try
    {
        //logger.Log("FindNodeSquencesOnPredicateChain called for subj=" + startMid + ", chainPreds=" + string.Join(" ", chainPredicates));

        if (chainPredicates == null || chainPredicates.Length == 0)
            return null;

        var pred = chainPredicates[0]; // first predicate on the chain
        var objNodes = GetPredicateObjectPairsForSubject(startMid)
            .Where(x => x.Item1 == pred) // (predicate, object)
            .Select(x => x.Item2) // object only
            .ToArray();
        if (objNodes.Length == 0)
            return null;

        if (chainPredicates.Length == 1) // done
            return objNodes.Select(x => new[] {x}).ToArray();

        // More than one predicate in the chain. The tail of the chain is the same for every
        // node, so build it once instead of once per node.
        var remainingPredicates = chainPredicates.Skip(1).ToArray();
        var ret = new List<string[]>();
        foreach (var node in objNodes)
        {
            var subSequences = FindNodeSquencesOnPredicateChain(node, remainingPredicates);
            if (subSequences == null) // cannot continue
                continue;
            ret.AddRange(subSequences.Select(seq => (new[] {node}).Concat(seq).ToArray()));
        }
        return ret.Count > 0 ? ret.ToArray() : null;
    }
    catch (Exception e)
    {
        logger.LogException("FindNodeSquencesOnPredicateChain failed", e);
        return null;
    }
}
393 |
/// <summary>
/// Returns the name recorded for <paramref name="mid"/> in the names table.
/// </summary>
/// <returns>The name, or null when <paramref name="mid"/> is null or has no entry.</returns>
private string GetName(string mid)
{
    if (mid == null)
        return null;

    string found;
    return namesTable.TryGetValue(mid, out found) ? found : null;
}
405 |
/// <summary>
/// Tells whether <paramref name="subject"/> is listed in the CVT node table.
/// </summary>
/// <remarks>
/// Only the presence of the key is checked; the boolean stored with it is not consulted.
/// </remarks>
private bool IsCVT(string subject)
{
    Dictionary<string, bool> partition;
    return cvtNodes.TryGetValue(GetSubjectKey(subject), out partition)
           && partition.ContainsKey(subject);
}
416 |
/// <summary>
/// Heuristic entity test: the string has one of the known MID prefixes, or it has an entry
/// with a non-empty name in the names table.
/// </summary>
private bool IsEntity(string obj)
{
    // We're missing this information in the compressed dataset.
    // For now, we'll do the following, but long-term consider fixing this to make it explicit in the dataset
    if (obj.StartsWith("m."))
        return true;
    if (obj.StartsWith("g."))
        return true;
    if (obj.StartsWith("en.")) // "en." is to support SEMPRE Freebase
        return true;

    return !string.IsNullOrEmpty(GetName(obj));
}
424 |
/// <summary>
/// Reads and decompresses the (predicate, object) pairs stored for <paramref name="subject"/>.
/// </summary>
/// <param name="subject">MID of the subject to look up.</param>
/// <returns>
/// The pairs in stored order. An empty list is returned when the subject is not in the index
/// or when reading fails; in both cases the exception is logged.
/// </returns>
private List<Tuple<string, string>> GetPredicateObjectPairsForSubject(string subject)
{
    try
    {
        long offset;
        var compressedChunksLengths = new List<int>();
        Tuple<long, List<int>> largeLocation;
        if (largeMidsToCompressedBlobsLocations.TryGetValue(subject, out largeLocation))
        {
            // Large subjects are stored as several consecutive compressed chunks.
            offset = largeLocation.Item1;
            compressedChunksLengths.AddRange(largeLocation.Item2);
        }
        else
        {
            var partitionkey = GetSubjectKey(subject);
            var dictionary = midToCompressedBlobLocation[partitionkey];
            var compressedResultLocation = dictionary[subject];
            offset = compressedResultLocation.Item1;
            compressedChunksLengths.Add(compressedResultLocation.Item2);
        }

        var toReturn = new List<Tuple<string, string>>();
        foreach (var length in compressedChunksLengths)
        {
            // does it span pages?
            var startPage = (int)(offset/pageSize);
            var endPage = (int)((offset + length - 1)/pageSize);
            byte[] compressedResult;
            int compressedResultIndex;
            if (startPage == endPage)
            {
                // Entirely inside one page: read it in place, no copy needed.
                compressedResult = datapages[startPage];
                compressedResultIndex = (int)(offset%pageSize);
            }
            else
            {
                // Stitch the chunk together page by page. Each step copies up to the end of
                // the current page, so a chunk that ends exactly on a page boundary still
                // gets its whole final page (the previous "(offset + length) % pageSize"
                // bound evaluated to 0 there and copied nothing from the last page).
                compressedResult = new byte[length];
                compressedResultIndex = 0;
                int copied = 0;
                long position = offset;
                while (copied < length)
                {
                    int page = (int)(position/pageSize);
                    int pageOffset = (int)(position%pageSize);
                    int take = Math.Min(length - copied, pageSize - pageOffset);
                    Buffer.BlockCopy(datapages[page], pageOffset, compressedResult, copied, take);
                    copied += take;
                    position += take;
                }
            }

            // Dispose the whole reader chain; the original left the gzip stream and reader undisposed.
            using (var memorystream = new MemoryStream(compressedResult, compressedResultIndex, length))
            using (var gzipstream = new GZipStream(memorystream, CompressionMode.Decompress, false))
            using (var reader = new StreamReader(gzipstream, Encoding.Unicode))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // Each line is "predicate<TAB>object"; anything else is ignored.
                    var split = line.Split('\t');
                    if (split.Length == 2 && !string.IsNullOrEmpty(split[0]))
                        toReturn.Add(new Tuple<string, string>(split[0], split[1]));
                }
            }

            // The next chunk of a large subject starts right after this one.
            offset += length;
        }

        return toReturn;
    }
    catch (Exception e)
    {
        logger.LogException("GetPredicateObjectPairsForSubject failed", e);
        return new List<Tuple<string, string>>();
    }
}
519 |
520 |
/// <summary>
/// Computes the partition key of a subject: its first four characters (or fewer if it is
/// shorter) when it starts with "m." or "g.", otherwise its first two characters (one for a
/// single-character subject).
/// </summary>
private static string GetSubjectKey(string subject)
{
    bool hasMidPrefix = subject.StartsWith("m.") || subject.StartsWith("g.");

    int keyLength;
    if (hasMidPrefix)
        keyLength = Math.Min(subject.Length, 4);
    else
        keyLength = subject.Length > 1 ? 2 : 1;

    return subject.Substring(0, keyLength);
}
531 |
/// <summary>
/// Loads every lookup table and the compressed data pages from the data directory into the
/// static fields of this class.
/// </summary>
/// <remarks>Exceptions from missing or malformed files are not caught here.</remarks>
private static void LoadIndex()
{
    string midToOffsetPath = Path.Combine(datadir, "midToOffset.bin");
    string largeMidToOffsetPath = Path.Combine(datadir, "largeMidToOffset.bin");
    string datapagesPath = Path.Combine(datadir, "datapages.bin");
    string cvtNodesPath = Path.Combine(datadir, "cvtnodes.bin");
    string namesTablePath = Path.Combine(datadir, "namesTable.bin");
    string predicateObjTypePath = Path.Combine(datadir, "predicate.objtype.txt");
    string ghostMidPath = Path.Combine(datadir, "ghost_mid.txt");

    logger.Log("Reading the ghost MID table");
    setGhostMid = new HashSet<string>(File.ReadAllLines(ghostMidPath));

    logger.Log("Reading the Predicate Objective Type table");
    predObjTypeTable = new Dictionary<string, FBNodeType>();
    // Each line: predicate <TAB> value count <TAB> entity count <TAB> CVT count.
    foreach (var x in File.ReadLines(predicateObjTypePath)
        .Select(ln => ln.Split('\t'))
        .Select(
            f =>
                new
                {
                    pred = f[0],
                    valcnt = long.Parse(f[1]),
                    entcnt = long.Parse(f[2]),
                    cvtcnt = long.Parse(f[3])
                }))
    {
        if ((x.valcnt == 0 && x.entcnt == 0) ||
            (x.entcnt == 0 && x.cvtcnt == 0) ||
            (x.valcnt == 0 && x.cvtcnt == 0)) // no inconsistency in the data, skip
            continue;

        // The predicate was seen with more than one object type: record the most frequent
        // one (ties go to Value, then Entity).
        if (x.valcnt >= Math.Max(x.entcnt, x.cvtcnt))
            predObjTypeTable.Add(x.pred, FBNodeType.Value);
        else if (x.entcnt >= Math.Max(x.valcnt, x.cvtcnt))
            predObjTypeTable.Add(x.pred, FBNodeType.Entity);
        else
            predObjTypeTable.Add(x.pred, FBNodeType.CVT);
    }

    // Each file is closed as soon as its table has been read; the original never closed them.
    logger.Log("Reading names table");
    using (var namesStream = File.OpenRead(namesTablePath))
        namesTable = DeserializeRelationTable(namesStream);

    logger.Log("Reading index");
    using (var indexStream = File.OpenRead(midToOffsetPath))
        midToCompressedBlobLocation = Deserialize(indexStream);
    using (var largeIndexStream = File.OpenRead(largeMidToOffsetPath))
        largeMidsToCompressedBlobsLocations = DeserializeSimple(largeIndexStream);
    using (var cvtStream = File.OpenRead(cvtNodesPath))
        cvtNodes = DeserializeCVTNodes(cvtStream);

    datapages = new List<byte[]>();
    using (var binreader = new BinaryReader(File.OpenRead(datapagesPath)))
    {
        // Read pageSize bytes at a time; a short (possibly empty) page marks the end of the file.
        while (true)
        {
            var page = binreader.ReadBytes(pageSize);
            datapages.Add(page);
            if (page.Length < pageSize)
                break;
        }
    }
}
591 |
592 |
/// <summary>
/// Reads the partitioned MID index: partition key -> subject MID -> (offset, compressed byte count).
/// </summary>
/// <param name="stream">
/// Binary stream laid out as: partition count, then for each partition its key, its entry
/// count, and per entry a subject string, an Int64 offset and an Int32 byte count.
/// </param>
/// <remarks>The stream is not closed by this method.</remarks>
private static Dictionary<string, Dictionary<string, Tuple<long, int>>> Deserialize(Stream stream)
{
    var input = new BinaryReader(stream);

    int partitionsLeft = input.ReadInt32();
    var index = new Dictionary<string, Dictionary<string, Tuple<long, int>>>();
    while (partitionsLeft-- > 0)
    {
        string partitionKey = input.ReadString();
        int entryCount = input.ReadInt32();
        var partition = new Dictionary<string, Tuple<long, int>>(entryCount);
        for (int entry = 0; entry < entryCount; entry++)
        {
            string mid = input.ReadString();
            long blobOffset = input.ReadInt64();
            int blobLength = input.ReadInt32();
            partition.Add(mid, Tuple.Create(blobOffset, blobLength));
        }
        index.Add(partitionKey, partition);
    }
    return index;
}
615 |
/// <summary>
/// Reads the CVT-node table: outer key -> (MID -> is-CVT flag).
/// </summary>
/// <remarks>
/// Layout consumed below: an Int32 count of inner dictionaries; then, per inner dictionary,
/// a string key, an Int32 entry count and that many (string mid, Boolean isCVT) records.
/// </remarks>
/// <param name="stream">Readable stream positioned at the start of the data. It is closed before this method returns.</param>
/// <returns>The populated nested dictionary.</returns>
private static Dictionary<string, Dictionary<string, bool>> DeserializeCVTNodes(Stream stream)
{
    // The using block closes the stream as well, so the file handle is not leaked.
    using (var reader = new BinaryReader(stream))
    {
        int outerCount = reader.ReadInt32();
        var table = new Dictionary<string, Dictionary<string, bool>>();
        for (int i = 0; i < outerCount; i++)
        {
            string key = reader.ReadString();
            int entryCount = reader.ReadInt32();
            var flags = new Dictionary<string, bool>(entryCount);
            for (int n = 0; n < entryCount; n++)
            {
                string mid = reader.ReadString();
                bool isCVT = reader.ReadBoolean();
                flags.Add(mid, isCVT);
            }
            table.Add(key, flags);
        }
        return table;
    }
}
637 |
/// <summary>
/// Reads the flat index for large subjects: subject -> (offset, list of Int32 counts).
/// </summary>
/// <remarks>
/// Layout consumed below: an Int32 entry count; then, per entry, a string subject, an Int64
/// offset, an Int32 number of counts and that many Int32 values.
/// </remarks>
/// <param name="stream">Readable stream positioned at the start of the data. It is closed before this method returns.</param>
/// <returns>The populated dictionary.</returns>
private static Dictionary<string, Tuple<long, List<int>>> DeserializeSimple(Stream stream)
{
    // The using block closes the stream as well, so the file handle is not leaked.
    using (var reader = new BinaryReader(stream))
    {
        int entryCount = reader.ReadInt32();
        var index = new Dictionary<string, Tuple<long, List<int>>>();
        for (int n = 0; n < entryCount; n++)
        {
            string subject = reader.ReadString();
            long offset = reader.ReadInt64();
            int numCounts = reader.ReadInt32();
            var counts = new List<int>();
            for (int i = 0; i < numCounts; i++)
                counts.Add(reader.ReadInt32());
            index.Add(subject, new Tuple<long, List<int>>(offset, counts));
        }
        return index;
    }
}
656 |
/// <summary>
/// Reads a flat string-to-string table.
/// </summary>
/// <remarks>
/// Layout consumed below: an Int32 entry count followed by that many (string key, string value)
/// pairs. A repeated key does not throw; the later value replaces the earlier one.
/// </remarks>
/// <param name="stream">Readable stream positioned at the start of the data. It is closed before this method returns.</param>
/// <returns>The populated dictionary.</returns>
private static Dictionary<string, string> DeserializeRelationTable(Stream stream)
{
    // The using block closes the stream as well, so the file handle is not leaked.
    using (var reader = new BinaryReader(stream))
    {
        int entryCount = reader.ReadInt32();
        var table = new Dictionary<string, string>(entryCount);
        for (int i = 0; i < entryCount; i++)
        {
            string key = reader.ReadString();
            string value = reader.ReadString();
            table[key] = value; // indexer on purpose: last duplicate wins
        }
        return table;
    }
}
670 | }
671 | }
--------------------------------------------------------------------------------
/FastRDFStore/FastRDFStore.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | AnyCPU
7 | {9E64D23C-9C19-49DA-9903-12F151140AE7}
8 | Exe
9 | Properties
10 | FastRDFStore
11 | FastRDFStore
12 | v4.5
13 | 512
14 | SAK
15 | SAK
16 | SAK
17 | SAK
18 |
19 |
20 | AnyCPU
21 | true
22 | full
23 | false
24 | bin\Debug\
25 | DEBUG;TRACE
26 | prompt
27 | 4
28 | false
29 |
30 |
31 | AnyCPU
32 | pdbonly
33 | true
34 | bin\Release\
35 | TRACE
36 | prompt
37 | 4
38 | false
39 |
40 |
41 | true
42 | bin\Debug\
43 | DEBUG;TRACE
44 | full
45 | x64
46 | prompt
47 | MinimumRecommendedRules.ruleset
48 | true
49 |
50 |
51 | bin\Release\
52 | TRACE
53 | true
54 | pdbonly
55 | x64
56 | prompt
57 | MinimumRecommendedRules.ruleset
58 | true
59 |
60 |
61 |
62 | ..\packages\CommandLineParser.1.9.71\lib\net45\CommandLine.dll
63 | True
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 | FreebaseCommonTypes.cs
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
100 |
--------------------------------------------------------------------------------
/FastRDFStore/FastRDFStore.xproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 14.0
5 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)
6 |
7 |
8 |
9 |
10 | 8db47aba-3494-448e-86e5-4926fb8d876a
11 | FastRDFStoreCore
12 | .\obj
13 | .\bin\
14 | v4.5.2
15 |
16 |
17 |
18 | 2.0
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/FastRDFStore/FreebaseCommonTypes.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.Serialization;
3 |
namespace FastRDFStore
{
    /// <summary>The three kinds of object a predicate can point at.</summary>
    public enum FBNodeType
    {
        /// <summary>A simple literal value.</summary>
        Value,

        /// <summary>An entity identified by a MID.</summary>
        Entity,

        /// <summary>A CVT node.</summary>
        CVT
    }

    /// <summary>One predicate together with every object reached through it.</summary>
    [DataContract(IsReference = true)]
    public class PredicateAndObjects
    {
        /// <summary>The predicate name.</summary>
        [DataMember]
        public string Predicate { get; set; }

        /// <summary>The objects attached to <see cref="Predicate"/>.</summary>
        [DataMember]
        public FBObject[] Objects { get; set; }
    }

    /// <summary>
    /// Base type of everything that can appear as the object of a predicate. A FBObject is one of:
    /// <list type="bullet">
    /// <item>a simple value ("42"): ValueFBObject(value="42")</item>
    /// <item>an entity (Ireland): SimpleFBObject(mid="m.012wgb", name="Ireland")</item>
    /// <item>a CVT node (dated integer value): CVTFBObject</item>
    /// </list>
    /// </summary>
    [DataContract(IsReference = true)]
    [KnownType(typeof(ValueFBObject))]
    [KnownType(typeof(SimpleFBObject))]
    [KnownType(typeof(CVTFBObject))]
    public abstract class FBObject
    {
        /// <summary>Text used when printing the object for a human reader.</summary>
        public abstract string PrettyString();

        /// <summary>The entity name or the literal value, depending on the concrete type.</summary>
        public abstract string GetNameOrValue();

        /// <summary>The MID of the object; the empty string for types that have none.</summary>
        public virtual string GetMid()
        {
            return string.Empty;
        }
    }

    /// <summary>A literal value.</summary>
    [DataContract(IsReference = true)]
    public class ValueFBObject : FBObject
    {
        /// <summary>The literal, kept as text.</summary>
        [DataMember]
        public string Value { get; set; }

        public override string PrettyString()
        {
            return Value;
        }

        public override string GetNameOrValue()
        {
            return Value;
        }
    }

    /// <summary>An entity with a MID, a display name and, optionally, its own outgoing predicates.</summary>
    [DataContract(IsReference = true)]
    public class SimpleFBObject : FBObject
    {
        [DataMember]
        public string Mid { get; set; }

        [DataMember]
        public string Name { get; set; }

        /// <summary>
        /// Outgoing predicates of this entity. Guaranteed that each predicate appears only once.
        /// May be null.
        /// </summary>
        [DataMember]
        public PredicateAndObjects[] Objects { get; set; }

        public override string PrettyString()
        {
            return Name;
        }

        public override string GetNameOrValue()
        {
            return Name;
        }

        public override string GetMid()
        {
            return Mid;
        }
    }

    /// <summary>A CVT node: it has a MID and outgoing predicates but no name of its own.</summary>
    [DataContract(IsReference = true)]
    public class CVTFBObject : FBObject
    {
        // mattri: Is this needed? If not used, could remove to save network traffic
        [DataMember]
        public string Mid { get; set; }

        /// <summary>
        /// Outgoing predicates of the CVT node. Guaranteed that each predicate appears only once.
        /// </summary>
        // mattri: Can a CVT node have the same predicate coming off of it twice, with different
        // objects? If not, replace with just an array of pairs
        [DataMember]
        public PredicateAndObjects[] Objects { get; set; }

        public override string PrettyString()
        {
            return string.Concat("[CVT ", Mid, "]");
        }

        public override string GetNameOrValue()
        {
            return "";
        }

        public override string GetMid()
        {
            return Mid;
        }
    }
}
--------------------------------------------------------------------------------
/FastRDFStore/IFastRDFStore.cs:
--------------------------------------------------------------------------------
1 | using System.Runtime.Serialization;
2 | using System.ServiceModel;
3 |
4 | namespace FastRDFStore
5 | {
/// <summary>
/// WCF service contract of the FastRDFStore server.
/// </summary>
/// <remarks>
/// NOTE(review): the implementation is not shown next to this contract; the per-operation
/// notes below are inferred from the names and from how the command line client calls them,
/// and should be confirmed against the implementation.
/// </remarks>
[ServiceContract(Namespace = "urn:ps")]
public interface IFastRDFStore
{
    /// <summary>Presumably the predicates that leave the given subject MID.</summary>
    [OperationContract]
    string[] GetOutboundPredicates(string subjectMid);

    /// <summary>Presumably the display names of the given entity MIDs.</summary>
    [OperationContract]
    string[] GetEntityNames(string[] entMids);

    /// <summary>
    /// Fetches the subject with its predicates and objects. The client treats a null return
    /// as "nothing found" and passes int.MaxValue as <paramref name="maxPerPredicate"/> to ask
    /// for everything.
    /// </summary>
    [OperationContract]
    SimpleFBObject GetSimpleObjectPredicatesAndCVTs(string subjectMid, int maxPerPredicate, bool followCVT);

    /// <summary>Same kind of result as above, restricted to one predicate (per the name).</summary>
    [OperationContract]
    SimpleFBObject GetSimpleObjectFilteredPredicateAndObjects(string subjectMid, string predicate);

    /// <summary>
    /// Follows a chain of predicates from a start MID. The client treats a null return as
    /// "nothing found" and prints each returned sequence as one tab-joined line.
    /// </summary>
    /// <remarks>
    /// The misspelling "Squences" is part of the published operation name; renaming it would
    /// break existing clients.
    /// </remarks>
    [OperationContract]
    string[][] FindNodeSquencesOnPredicateChain(string startMid, string[] chainPredicates);
}
24 | }
--------------------------------------------------------------------------------
/FastRDFStore/Logger.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 | using System.Text;
4 |
5 | namespace FastRDFStore
6 | {
/// <summary>
/// Minimal logger: every entry is written to the console and appended to a log file as one
/// tab-separated line (timestamp, severity, message).
/// </summary>
public class Logger
{
    /// <summary>Severity tag written as the second column of each log line.</summary>
    public enum Severity { INFO, WARNING, ERROR, EXCEPTION };

    /// <summary>Lock held while a line is written to the console and to the file.</summary>
    public object outputLock = new object();
    private readonly string logFilename;

    /// <param name="logFilename">Target file. Null, empty or whitespace disables logging entirely.</param>
    public Logger(string logFilename) { this.logFilename = logFilename; }

    /// <summary>
    /// Keeps a message on a single line and inside a single column by replacing line breaks
    /// with the two characters "\n" and tabs with "\t". A null message becomes the empty string.
    /// </summary>
    private static string EscapeNewlineAndTab(string s)
    {
        if (string.IsNullOrEmpty(s))
            return string.Empty;

        // Two-character line breaks are replaced first so they collapse into a single "\n".
        return s.Replace("\r\n", "\\n").Replace("\n\r", "\\n").Replace("\r", "\\n").Replace("\n", "\\n").Replace("\t", "\\t");
    }

    /// <summary>Logs a message plus the exception details with severity EXCEPTION.</summary>
    /// <param name="message">Context for the failure.</param>
    /// <param name="e">The exception; when null only the message is logged.</param>
    public void LogException(string message, Exception e)
    {
        // A logger must not throw while reporting a failure, so a missing exception is tolerated.
        string fullMessage = message + ". Exception info: " + (e == null ? string.Empty : e.ToString());
        if (e != null && e.InnerException != null)
            fullMessage += " *** With InnerException: " + e.InnerException.ToString();
        Log(fullMessage, Severity.EXCEPTION);
    }

    /// <summary>Writes one log line to the console and appends it to the log file.</summary>
    /// <param name="message">Text to log; null is logged as an empty message.</param>
    /// <param name="severity">Severity column of the line.</param>
    public void Log(string message, Severity severity = Severity.INFO)
    {
        if (string.IsNullOrWhiteSpace(logFilename)) // empty log file name -> skip logging
            return;

        lock (outputLock)
        {
            // Use "sortable" datetime for later log file processing convenience
            string line = DateTime.Now.ToString("s") + "\t" + severity + "\t" + EscapeNewlineAndTab(message);
            Console.WriteLine(line);
            File.AppendAllText(logFilename, line + Environment.NewLine);
        }
    }
}
44 | }
--------------------------------------------------------------------------------
/FastRDFStore/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.ServiceModel;
3 | using System.Threading;
4 | using CommandLine;
5 |
6 | // This is the project that starts the FastRDFStore WCF service.
7 |
8 | namespace FastRDFStore
9 | {
/// <summary>Command line options of the FastRDFStore service host.</summary>
internal class CommandLineArguments
{
    /// <summary>Directory the store files are loaded from.</summary>
    [Option('i', "idir", HelpText = "Directory containing *.bin files", DefaultValue = "")]
    public string idir { get; set; }

    /// <summary>Host name used in the service endpoint address.</summary>
    [Option('s', "server", HelpText = "Server", DefaultValue = "localhost")]
    public string server { get; set; }

    /// <summary>TCP port used in the service endpoint address.</summary>
    [Option('p', "port", HelpText = "Connect to the FastRDFStore server on this port", DefaultValue = 9358)]
    public int port { get; set; }

    /// <summary>Log file path; empty turns logging off.</summary>
    [Option('l', "log", HelpText = "Log file. Set to empty to disable logging", DefaultValue = "FastRDFStore.log")]
    public string logfile { get; set; }

    /// <summary>Filled in by the parser with the errors of the last parse.</summary>
    [ParserState]
    public IParserState LastParserState { get; set; }

    /// <summary>Builds the usage/help text, including any parsing errors.</summary>
    [HelpOption]
    public string GetUsage()
    {
        Action<CommandLine.Text.HelpText> configure = help =>
        {
            help.Copyright = " ";
            help.AdditionalNewLineAfterOption = false;
            help.MaximumDisplayWidth = Console.WindowWidth;
            help.Heading = System.AppDomain.CurrentDomain.FriendlyName + " Usage:";
            CommandLine.Text.HelpText.DefaultParsingErrorsHandler(this, help);
        };

        return CommandLine.Text.HelpText.AutoBuild(this, configure);
    }
}
42 |
/// <summary>Console host: loads the store, opens the WCF endpoint and then idles.</summary>
internal class Program
{
    private static void Main(string[] args)
    {
        var options = new CommandLineArguments();
        Parser.Default.ParseArgumentsStrict(args, options);

        FastRDFStore.Initialize(options.idir, options.logfile);
        StartRDFStoreService(options.server, options.port);

        // Wait for user to hit CTRL-C
        Thread.Sleep(Timeout.Infinite);
    }

    /// <summary>
    /// Opens a net.tcp endpoint at net.tcp://{server}:{port}/solver that serves
    /// <see cref="IFastRDFStore"/> without transport security and with the size quotas and
    /// timeouts raised to their maximum values.
    /// </summary>
    public static void StartRDFStoreService(string server, int port)
    {
        var host = new ServiceHost(typeof(FastRDFStore));

        var binding = new NetTcpBinding(SecurityMode.None);
        binding.MaxBufferSize = int.MaxValue;
        binding.MaxBufferPoolSize = int.MaxValue;
        binding.MaxReceivedMessageSize = int.MaxValue;
        binding.ReceiveTimeout = TimeSpan.MaxValue;
        binding.CloseTimeout = TimeSpan.MaxValue;
        binding.TransferMode = TransferMode.Buffered;
        binding.ReaderQuotas.MaxDepth = int.MaxValue;
        //binding.MaxConnections = 5;
        //binding.ListenBacklog = 5;

        string endpointAddress = string.Format("net.tcp://{0}:{1}/solver", server, port);
        host.AddServiceEndpoint(typeof(IFastRDFStore), binding, endpointAddress);
        host.Open();
    }
}
83 | }
--------------------------------------------------------------------------------
/FastRDFStore/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General Information about an assembly is controlled through the following
6 | // set of attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyTitle("FastRDFStore")]
9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("")]
12 | [assembly: AssemblyProduct("FastRDFStore")]
13 | [assembly: AssemblyCopyright("Copyright © 2015")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to false makes the types in this assembly not visible
18 | // to COM components. If you need to access a type in this assembly from
19 | // COM, set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // The following GUID is for the ID of the typelib if this project is exposed to COM
23 | [assembly: Guid("19cfb8cc-7d2a-4259-be9e-76a1b4380955")]
24 |
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | // Major Version
28 | // Minor Version
29 | // Build Number
30 | // Revision
31 | //
32 | // You can specify all the values or you can default the Build and Revision Numbers
33 | // by using the '*' as shown below:
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("1.0.0.0")]
36 | [assembly: AssemblyFileVersion("1.0.0.0")]
37 |
--------------------------------------------------------------------------------
/FastRDFStore/deploy-debug.bat:
--------------------------------------------------------------------------------
1 | set VERSION=Debug
2 | xcopy /d /y bin\%VERSION%\* \\tspace10\e$\users\tmsnwork\runfb\bin
3 |
--------------------------------------------------------------------------------
/FastRDFStore/deploy.bat:
--------------------------------------------------------------------------------
1 | set VERSION=Release
2 | xcopy /d /y bin\%VERSION%\* \\tspace10\e$\users\tmsnwork\runfb\bin
3 |
--------------------------------------------------------------------------------
/FastRDFStore/packages.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/FastRDFStoreClient/App.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/FastRDFStoreClient/FastRDFStoreClient.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | AnyCPU
7 | {06F0469F-5ACC-4937-89AF-CD5436132C02}
8 | Exe
9 | Properties
10 | FastRDFStoreClient
11 | FastRDFStoreClient
12 | v4.5
13 | 512
14 | true
15 |
16 | SAK
17 | SAK
18 | SAK
19 | SAK
20 |
21 |
22 | AnyCPU
23 | true
24 | full
25 | false
26 | bin\Debug\
27 | TRACE;DEBUG
28 | prompt
29 | 4
30 | false
31 |
32 |
33 | AnyCPU
34 | pdbonly
35 | true
36 | bin\Release\
37 | TRACE
38 | prompt
39 | 4
40 | false
41 |
42 |
43 | true
44 | bin\x64\Debug\
45 | TRACE;DEBUG
46 | full
47 | x64
48 | prompt
49 | MinimumRecommendedRules.ruleset
50 | true
51 |
52 |
53 | bin\x64\Release\
54 | TRACE
55 | true
56 | pdbonly
57 | x64
58 | prompt
59 | MinimumRecommendedRules.ruleset
60 | true
61 |
62 |
63 |
64 | ..\packages\CommandLineParser.1.9.71\lib\net45\CommandLine.dll
65 | True
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 | IFastRDFStore.cs
81 |
82 |
83 | FreebaseCommonTypes.cs
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
100 |
--------------------------------------------------------------------------------
/FastRDFStoreClient/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 | using System.ServiceModel;
7 | using CommandLine;
8 | using FastRDFStore;
9 |
10 |
11 | /// <summary>
12 | /// A command line client to call the FastRDFStore WCF service.
13 | /// </summary>
14 |
15 | namespace FastRDFStoreClient
16 | {
/// <summary>Command line options of the FastRDFStore client.</summary>
internal class CommandLineArguments
{
    // Connect to the RDFStore server at this location/port
    [Option('s', "server", HelpText = "Connect to the FastRDFStore server on this server [localhost]", DefaultValue = "localhost")]
    public string server { get; set; }

    [Option('p', "port", HelpText = "Connect to the FastRDFStore server on this port [9358]", DefaultValue = 9358)]
    public int port { get; set; }

    /// <summary>When set, results are printed as bare MIDs instead of the indented tree.</summary>
    [Option('d', "dump", HelpText = "DumpMID")]
    public bool dump { get; set; }

    /// <summary>Subject MID; when absent the client prompts for input in a loop.</summary>
    [Option('m', "mid", HelpText = "MID to search for")]
    public string mid { get; set; }

    /// <summary>When set together with a MID, prints predicate / type / value lines only.</summary>
    [Option('t', "tripleOnly", HelpText = "Triple Only")]
    public bool tripleOnly { get; set; }

    [Option("pred", HelpText = "(optional) predicate for filtering")]
    public string predicate { get; set; }

    /// <summary>Space-separated predicates; used together with a MID for a chain search.</summary>
    [Option('c', "chain", HelpText = "Predicate chain to search for")]
    public string predicateChain { get; set; }

    /// <summary>Filled in by the parser with the errors of the last parse.</summary>
    [ParserState]
    public IParserState LastParserState { get; set; }

    /// <summary>Builds the usage/help text, including any parsing errors.</summary>
    [HelpOption]
    public string GetUsage()
    {
        Action<CommandLine.Text.HelpText> configure = help =>
        {
            help.Copyright = " ";
            help.AdditionalNewLineAfterOption = false;
            help.MaximumDisplayWidth = Console.WindowWidth;
            help.Heading = System.AppDomain.CurrentDomain.FriendlyName + " Usage:";
            CommandLine.Text.HelpText.DefaultParsingErrorsHandler(this, help);
        };

        return CommandLine.Text.HelpText.AutoBuild(this, configure);
    }
}
59 |
60 |
61 | internal class Program
62 | {
/// <summary>
/// Client entry point. With a MID on the command line it runs one query and exits:
/// a predicate-chain search (--mid with --chain), a flat triple listing (--mid with
/// --tripleOnly), or a full / predicate-filtered lookup. Without a MID it prompts for
/// subjects (and optionally a predicate) until standard input ends.
/// </summary>
private static void Main(string[] args)
{
    CommandLineArguments cmd = new CommandLineArguments();
    Parser.Default.ParseArgumentsStrict(args, cmd);

    var binding = new NetTcpBinding(SecurityMode.None);
    binding.MaxBufferSize = int.MaxValue;
    binding.MaxBufferPoolSize = int.MaxValue;
    binding.MaxReceivedMessageSize = int.MaxValue;

    // Don't need identity because we're connecting without security. If we need security later, uncomment the following lines
    //EndpointIdentity identity = EndpointIdentity.CreateUpnIdentity(System.Security.Principal.WindowsIdentity.GetCurrent().Name);
    var myEndpoint = new EndpointAddress(new Uri("net.tcp://" + cmd.server + ":" + cmd.port + "/solver") /*, identity*/);
    var myChannelFactory = new ChannelFactory<IFastRDFStore>(binding, myEndpoint);
    IFastRDFStore fastRDFStoreClient = myChannelFactory.CreateChannel();

    Console.WriteLine("Endpoint connected to " + myEndpoint.Uri);
    do
    {
        if (cmd.mid != null && cmd.predicateChain != null)
        {
            var result = fastRDFStoreClient.FindNodeSquencesOnPredicateChain(cmd.mid, cmd.predicateChain.Split(' '));
            if (result == null)
                Console.WriteLine("Nothing is found.");
            else
            {
                foreach (var seq in result)
                    Console.WriteLine(string.Join("\t", seq));
            }
            return;
        }

        if (cmd.mid != null && cmd.tripleOnly)
        {
            var result = fastRDFStoreClient.GetSimpleObjectPredicatesAndCVTs(cmd.mid, int.MaxValue, false);

            // Objects is documented as possibly null, so treat that like an empty answer.
            if (result == null || result.Objects == null)
                Console.WriteLine("Nothing is found.");
            else
            {
                foreach (var po in result.Objects)
                {
                    if (po == null || po.Objects == null)
                        continue;

                    foreach (var node in po.Objects)
                    {
                        string type, val;
                        if (node is ValueFBObject)
                        {
                            type = "Literal";
                            val = node.GetNameOrValue();
                        }
                        else if (node is CVTFBObject)
                        {
                            type = "CVT";
                            val = node.GetMid();
                        }
                        else // (node is SimpleFBObject)
                        {
                            type = "Entity";
                            val = node.GetMid();
                        }

                        Console.WriteLine("{0}\t{1}\t{2}", po.Predicate, type, val);
                    }
                }
            }
            return;
        }

        string subject = cmd.mid;
        string predicate = cmd.predicate;

        if (subject == null)
        {
            Console.WriteLine("Enter a Mid (m.06w2sn5) or hit enter to also enter a predicate.");
            Console.Write("Enter subject: ");
            subject = Console.ReadLine();
            if (subject == null) // end of input (Ctrl-Z / closed pipe): stop instead of looping forever
                return;
        }

        if (subject == "")
        {
            Console.WriteLine("First enter a Mid (m.06w2sn5) then a predicate (people.person.parents).");
            // Example predicate to try
            //   m.06w2sn5 (Justin Bieber)
            //     people.person.parents (a non CVT relationship)
            //     people.person.sibling_s people.sibling_relationship.sibling (a CVT mediated relationship)
            // Or:
            //   m.019nnl (Family Guy)
            //     tv.tv_program.regular_cast tv.regular_tv_appearance.actor

            Console.Write("Enter subject: ");
            subject = Console.ReadLine();
            if (subject == null)
                return;
            Console.Write("Enter predicate: ");
            predicate = Console.ReadLine();
        }

        // Stopwatch is monotonic, unlike subtracting two DateTime.Now readings.
        var timer = System.Diagnostics.Stopwatch.StartNew();
        SimpleFBObject results;

        if (predicate == null)
            results = fastRDFStoreClient.GetSimpleObjectPredicatesAndCVTs(subject, int.MaxValue, true);
        else
            results = fastRDFStoreClient.GetSimpleObjectFilteredPredicateAndObjects(subject, predicate);
        double retrieveSec = timer.Elapsed.TotalSeconds;

        if (results == null)
        {
            // The other query modes already treat null as "not found"; do the same here
            // rather than crashing on results.Mid.
            Console.WriteLine("Nothing is found.");
            Console.WriteLine();
            continue;
        }

        HashSet<string> alreadyOutput = new HashSet<string>();
        alreadyOutput.Add(results.Mid);
        if (results.Objects != null)
        {
            foreach (PredicateAndObjects predAndObjs in results.Objects)
            {
                if (cmd.dump)
                    OutputMids(predAndObjs, alreadyOutput);
                else
                    OutputPredicateAndObjects(predAndObjs, alreadyOutput);
            }
        }

        Console.WriteLine("Took " + retrieveSec + " seconds to retrieve results ");
        Console.WriteLine();
    } while (cmd.mid == null); // Loop forever if using console input
}
183 |
/// <summary>
/// Prints the MID of every entity and CVT object under a predicate, one per line, descending
/// into each CVT node the first time it is encountered. Literal values are skipped.
/// </summary>
/// <param name="predAndObjects">Predicate whose objects are printed; null or object-less input prints nothing.</param>
/// <param name="alreadyOutput">MIDs of CVT nodes already expanded; used to avoid expanding a node twice. Created when null.</param>
private static void OutputMids(PredicateAndObjects predAndObjects, HashSet<string> alreadyOutput = null)
{
    if (predAndObjects == null || predAndObjects.Objects == null)
        return;
    if (alreadyOutput == null)
        alreadyOutput = new HashSet<string>();

    foreach (FBObject fbObj in predAndObjects.Objects)
    {
        SimpleFBObject simpleFBObj = fbObj as SimpleFBObject;
        if (simpleFBObj != null)
        {
            Console.WriteLine(simpleFBObj.Mid);
            continue;
        }

        CVTFBObject cvtObj = fbObj as CVTFBObject;
        if (cvtObj == null)
            continue;

        Console.WriteLine(cvtObj.Mid);
        // Add returns false when the MID was already recorded, i.e. the node was expanded before.
        // A CVT node returned without its predicates has Objects == null and is simply not expanded.
        if (alreadyOutput.Add(cvtObj.Mid) && cvtObj.Objects != null)
        {
            foreach (PredicateAndObjects cvtObject in cvtObj.Objects)
                OutputMids(cvtObject, alreadyOutput);
        }
    }
}
208 |
/// <summary>
/// Pretty-prints a predicate and its objects as "predicate --> object" lines, recursing into
/// entities and CVT nodes with four extra spaces of indentation per level.
/// </summary>
/// <param name="predAndObjects">Predicate to print; null prints nothing.</param>
/// <param name="alreadyOutput">MIDs already expanded; a node is expanded only the first time it is seen. Created when null.</param>
/// <param name="indent">Number of leading spaces for this nesting level.</param>
private static void OutputPredicateAndObjects(PredicateAndObjects predAndObjects, HashSet<string> alreadyOutput = null, int indent = 0)
{
    if (predAndObjects == null)
        return;
    if (alreadyOutput == null)
        alreadyOutput = new HashSet<string>();

    string padding = new string(' ', indent);
    Console.Write(padding);
    Console.Write(string.Format("{0,-40} --> ", predAndObjects.Predicate));

    FBObject[] objects = predAndObjects.Objects;
    if (objects == null || objects.Length == 0)
    {
        // Nothing follows the arrow: finish the line so the next output starts on its own line.
        Console.WriteLine();
        return;
    }

    bool firstObject = true;
    foreach (FBObject fbObj in objects)
    {
        if (!firstObject)
        {
            // Continuation lines need the same indent as the predicate line, otherwise
            // the arrows of nested predicates drift out of alignment.
            Console.Write(padding);
            Console.Write(string.Format("{0,-40} --> ", ""));
        }
        firstObject = false;

        if (fbObj is ValueFBObject)
        {
            Console.WriteLine((fbObj as ValueFBObject).Value);
        }
        else if (fbObj is SimpleFBObject)
        {
            SimpleFBObject simpleFBObj = fbObj as SimpleFBObject;
            Console.WriteLine((simpleFBObj.Name ?? "[no name]") + " (" + simpleFBObj.Mid + ")");
            if (simpleFBObj.Objects != null && alreadyOutput.Add(simpleFBObj.Mid))
            {
                foreach (PredicateAndObjects child in simpleFBObj.Objects)
                    OutputPredicateAndObjects(child, alreadyOutput, indent + 4);
            }
        }
        else if (fbObj is CVTFBObject)
        {
            CVTFBObject cvtObj = fbObj as CVTFBObject;
            Console.WriteLine("CVT (" + cvtObj.Mid + ")");
            // Add returns false when the node was expanded before; a CVT node without
            // predicates (Objects == null) is recorded but has nothing to expand.
            if (alreadyOutput.Add(cvtObj.Mid) && cvtObj.Objects != null)
            {
                foreach (PredicateAndObjects child in cvtObj.Objects)
                    OutputPredicateAndObjects(child, alreadyOutput, indent + 4);
            }
        }
        else
        {
            Console.WriteLine("[Unknown object: " + fbObj);
        }
    }
}
259 | }
260 | }
--------------------------------------------------------------------------------
/FastRDFStoreClient/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General Information about an assembly is controlled through the following
6 | // set of attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyTitle("FastRDFStoreClient")]
9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("")]
12 | [assembly: AssemblyProduct("FastRDFStoreClient")]
13 | [assembly: AssemblyCopyright("Copyright © 2015")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to false makes the types in this assembly not visible
18 | // to COM components. If you need to access a type in this assembly from
19 | // COM, set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // The following GUID is for the ID of the typelib if this project is exposed to COM
23 | [assembly: Guid("06f0469f-5acc-4937-89af-cd5436132c02")]
24 |
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | // Major Version
28 | // Minor Version
29 | // Build Number
30 | // Revision
31 | //
32 | // You can specify all the values or you can default the Build and Revision Numbers
33 | // by using the '*' as shown below:
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("1.0.0.0")]
36 | [assembly: AssemblyFileVersion("1.0.0.0")]
37 |
--------------------------------------------------------------------------------
/FastRDFStoreClient/packages.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/FreebaseToRDFStore/FreebaseToRDFStore.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | AnyCPU
7 | {DFE0B2DA-CDB9-4B5C-8472-E04AD2E9C9EC}
8 | Exe
9 | Properties
10 | FreebaseToRDFStore
11 | FreebaseToRDFStore
12 | v4.5
13 | 512
14 | SAK
15 | SAK
16 | SAK
17 | SAK
18 |
19 |
20 |
21 | AnyCPU
22 | true
23 | full
24 | false
25 | bin\Debug\
26 | DEBUG;TRACE
27 | prompt
28 | 4
29 | false
30 |
31 |
32 | AnyCPU
33 | pdbonly
34 | true
35 | bin\Release\
36 | TRACE
37 | prompt
38 | 4
39 | false
40 |
41 |
42 | true
43 | bin\x64\Debug\
44 | DEBUG;TRACE
45 | full
46 | x64
47 | prompt
48 | MinimumRecommendedRules.ruleset
49 | false
50 |
51 |
52 | bin\x64\Release\
53 | TRACE
54 | true
55 | pdbonly
56 | x64
57 | prompt
58 | MinimumRecommendedRules.ruleset
59 | false
60 |
61 |
62 |
63 | ..\packages\CommandLineParser.1.9.71\lib\net45\CommandLine.dll
64 | True
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
90 |
--------------------------------------------------------------------------------
/FreebaseToRDFStore/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.IO.Compression;
5 | using System.Linq;
6 | using System.Text;
7 | using CommandLine;
8 |
9 | namespace FreebaseToRDFStore
10 | {
/// <summary>The processing step selected with the --cmd option.</summary>
internal enum Command
{
    /// <summary>Trim the raw Freebase RDF dump into the filtered fb_en* files.</summary>
    TrimData,

    /// <summary>Read the filtered fb_en* triples and write the *.bin store files.</summary>
    BuildStore,

    /// <summary>Find ghost object nodes; run last.</summary>
    FindGhost
}
17 |
/// <summary>Command line options of the Freebase-to-store converter.</summary>
internal class CommandLineArguments
{
    /// <summary>Which processing step to run.</summary>
    [Option('c', "cmd", Required = true,
        HelpText = "Run TrimData first, then BuildStore, then FindGhost. TrimData trims the raw freebase RDF dump and outputs the filtered fb_en*. " +
                   "BuildStore reads the filtered triples fb_en* and outputs the in-memory store (*.bin file). " +
                   "Use FindGhost (find ghost object nodes) in the end.",
        DefaultValue = Command.BuildStore)]
    public Command cmd { get; set; }

    /// <summary>Directory the input files are read from.</summary>
    [Option('i', "idir", HelpText = "Input directory for reading files", DefaultValue = "")]
    public string idir { get; set; }

    /// <summary>Directory the output files are written to.</summary>
    [Option('o', "odir", HelpText = "Output directory for writing files", DefaultValue = "")]
    public string odir { get; set; }

    /// <summary>Filled in by the parser with the errors of the last parse.</summary>
    [ParserState]
    public IParserState LastParserState { get; set; }

    /// <summary>Builds the usage/help text, including any parsing errors.</summary>
    [HelpOption]
    public string GetUsage()
    {
        Action<CommandLine.Text.HelpText> configure = help =>
        {
            help.Copyright = " ";
            help.AdditionalNewLineAfterOption = false;
            help.MaximumDisplayWidth = Console.WindowWidth;
            help.Heading = System.AppDomain.CurrentDomain.FriendlyName + " Usage:";
            CommandLine.Text.HelpText.DefaultParsingErrorsHandler(this, help);
        };

        return CommandLine.Text.HelpText.AutoBuild(this, configure);
    }
}
52 |
53 | internal class Program
54 | {
55 | private const int maxBuilderLengthInChars = 250*1024*1024; // 500MB (bytes not chars)
56 |
/// <summary>
/// Entry point: parses the command line, runs the step chosen with --cmd and prints "Done".
/// </summary>
private static void Main(string[] args)
{
    var options = new CommandLineArguments();
    Parser.Default.ParseArgumentsStrict(args, options);

    switch (options.cmd)
    {
        case Command.TrimData:
            TrimData(options);
            break;
        case Command.BuildStore:
            BuildStore(options);
            break;
        case Command.FindGhost:
            FindGhost(options);
            break;
    }

    Console.WriteLine("Done");
}
75 |
/// <summary>
/// Builds the binary store from the trimmed triple files.
/// Reads fb_en.txt followed by fb_en_nonM.txt from <c>cmd.idir</c> (one tab-separated
/// "subject, predicate, object" triple per line) and writes datapages.bin, midToOffset.bin,
/// largeMidToOffset.bin, cvtnodes.bin and namesTable.bin into <c>cmd.odir</c>.
/// </summary>
/// <remarks>
/// The triples of one subject must be contiguous in the input: a subject that shows up in two
/// separate runs raises "Duplicate runs for mid ...". Any exception raised while processing a
/// line is printed and stops the scan; the index files are still written for the subjects
/// completed up to that point.
/// </remarks>
/// <param name="cmd">Parsed command line; only <c>idir</c> and <c>odir</c> are read here.</param>
public static void BuildStore(CommandLineArguments cmd)
{
    string midToOffsetFilename = Path.Combine(cmd.odir, "midToOffset.bin");
    string largeMidToOffsetFilename = Path.Combine(cmd.odir, "largeMidToOffset.bin");
    string datapagesFilename = Path.Combine(cmd.odir, "datapages.bin");
    string cvtNodesFilename = Path.Combine(cmd.odir, "cvtnodes.bin");
    string namesTableFilename = Path.Combine(cmd.odir, "namesTable.bin");

    // Ensure we can write to the output files before starting. Creating the StreamWriter also
    // truncates a pre-existing file, which the File.OpenWrite calls below would not do.
    (new StreamWriter(midToOffsetFilename)).Close();
    (new StreamWriter(largeMidToOffsetFilename)).Close();
    (new StreamWriter(datapagesFilename)).Close();
    (new StreamWriter(cvtNodesFilename)).Close();
    (new StreamWriter(namesTableFilename)).Close();

    var ifile1 = Path.Combine(cmd.idir, "fb_en.txt");
    var ifile2 = Path.Combine(cmd.idir, "fb_en_nonM.txt");

    // SY: This is the main data structure that records the data location.
    // Given a subject MID, it records where in the binary (compressed) file its group of tuples is stored.
    //   key -> { subject -> (start_position, length) }
    //   key: a prefix of the subject MID (see GetSubjectKey), used to split the index into smaller dictionaries
    //   subject: the MID of the subject
    //   start_position: the starting position of the group in the binary (compressed) file
    //   length: the length of the group in the binary (compressed) file, capped at int.MaxValue
    var midToOffsetDictionaries = new Dictionary<string, Dictionary<string, Tuple<long, int>>>();

    // SY: If an MID group is too large and has been broken into several parts, this dictionary stores
    // the starting position and the compressed size of each part.
    //   subject -> (start_position, [size1, size2, size3, ...])
    var largeMidsToCompressedBlobsLocations = new Dictionary<string, Tuple<long, List<int>>>();

    // SY: A table to store whether a subject is a CVT node or not. Only CVT subjects are stored.
    //   key -> { subject -> true }
    var cvtNodes = new Dictionary<string, Dictionary<string, bool>>();

    // SY: The table to store entity names (subject -> name).
    var namesTable = new Dictionary<string, string>();

    long currentOffset = 0;
    long curLines = 0; // counter for the number of lines
    var currentMidCounts = 0.0;
    var avgUncompressedSize = 0.0;
    var avgCompressedSize = 0.0;

    // SY: Predicates that will be excluded from the index.
    // Materialised once: a lazy Select() here would be re-run (and re-allocate) for every input line.
    var excludedDomains = new[] {"authority", "imdb", "internet", "source"}.Select(x => x + ".").ToArray();
    var excludedPredicates = new HashSet<string>(new[]
    {
        "type.object.key",
        "type.object.permission",
        "common.topic.image",

        "common.topic.topic_equivalent_webpage",
        "common.topic.topical_webpage",
        "en",
        "base.ranker.rankerurlname",

        "type.object.type",
        "common.topic.description"
    });

    using (var datapagesWriter = new BinaryWriter(File.OpenWrite(datapagesFilename)))
    {
        var lastSubject = "";
        var builder = new StringBuilder();
        var multipleCompressedLengths = new List<int>();
        long totalRawBytes = 0;
        bool wroteMidToConsole = false;
        bool isCVT = true;

        // SY: Adding the dummy line in the end to make sure that the final group is indexed.
        foreach (var line in File.ReadLines(ifile1).Concat(File.ReadLines(ifile2)).Concat(new string[] {"dummy\tdummy\tdummy"}))
        {
            try
            {
                curLines++;
                var parts = line.Split('\t');

                if (excludedPredicates.Contains(parts[1]) || excludedDomains.Any(pre => parts[1].StartsWith(pre, StringComparison.Ordinal)))
                    continue;

                // SY: Assuming the tuples are grouped by the MIDs of the subject field
                var subject = parts[0];
                if (subject == lastSubject || lastSubject == "") // SY: still the same group of tuples, or just the first line
                {
                    // SY: Append the predicate and object to the string builder (for this subject).
                    // The existence of an entity name is used as the indication that the subject is not a CVT.
                    if (AppendTriple(builder, parts, namesTable))
                        isCVT = false;

                    // SY: If this group is too large, break it
                    if (builder.Length > maxBuilderLengthInChars)
                    {
                        long rawBytes;
                        var partBytesCount = CompressAndSave(builder, datapagesWriter, out rawBytes);
                        multipleCompressedLengths.Add(partBytesCount);
                        totalRawBytes += rawBytes;
                        if (!wroteMidToConsole)
                        {
                            Console.WriteLine();
                            Console.WriteLine("Large Mid: " + subject);
                            wroteMidToConsole = true;
                        }
                    }

                    // SY: for the first line only
                    if (lastSubject == "") lastSubject = subject;
                }
                else // SY: Output the data of the previous subject group
                {
                    // SY: compress and save the unsaved string builder content first,
                    // unless the group was split and the last part ended exactly at the group boundary
                    if (!multipleCompressedLengths.Any() || builder.Length > 0)
                    {
                        long rawBytes;
                        var bytesCount = CompressAndSave(builder, datapagesWriter, out rawBytes);
                        totalRawBytes += rawBytes;
                        multipleCompressedLengths.Add(bytesCount);
                    }

                    // SY: Total size of the compressed data
                    var compressedBytesCount = multipleCompressedLengths.Sum(len => (long)len);

                    // SY: Initialize the dictionary for the next subject group
                    var newKey = GetSubjectKey(subject);
                    if (!midToOffsetDictionaries.ContainsKey(newKey))
                        midToOffsetDictionaries.Add(newKey, new Dictionary<string, Tuple<long, int>>());

                    #region Save the previous block

                    var key = GetSubjectKey(lastSubject);

                    // The dictionary for the very first subject group is never pre-created by the
                    // "next subject" initialisation above, so create it on demand here instead of
                    // failing with KeyNotFoundException when the first two groups have different keys.
                    Dictionary<string, Tuple<long, int>> offsetsForKey;
                    if (!midToOffsetDictionaries.TryGetValue(key, out offsetsForKey))
                    {
                        offsetsForKey = new Dictionary<string, Tuple<long, int>>();
                        midToOffsetDictionaries.Add(key, offsetsForKey);
                    }

                    // SY: Add the position and length of the group of "lastSubject" in the offset dictionary
                    if (offsetsForKey.ContainsKey(lastSubject))
                        throw new Exception("Duplicate runs for mid " + lastSubject + ", line: " + line);
                    offsetsForKey.Add(lastSubject, new Tuple<long, int>(currentOffset, (int)Math.Min(compressedBytesCount, int.MaxValue)));

                    if (isCVT)
                    {
                        if (!cvtNodes.ContainsKey(key))
                            cvtNodes.Add(key, new Dictionary<string, bool>());
                        if (!cvtNodes[key].ContainsKey(lastSubject))
                            cvtNodes[key][lastSubject] = true;
                    }

                    if (multipleCompressedLengths.Count > 1)
                    {
                        // Copy the list: multipleCompressedLengths is cleared and reused for the next group.
                        largeMidsToCompressedBlobsLocations.Add(lastSubject, new Tuple<long, List<int>>(currentOffset, multipleCompressedLengths.ToList()));
                    }

                    multipleCompressedLengths.Clear();

                    #endregion

                    // reset "lastSubject" to the current subject and other variables
                    lastSubject = subject;
                    currentOffset += compressedBytesCount;
                    isCVT = true;
                    wroteMidToConsole = false;

                    // don't forget to process the current line (with the new subject), now that the builder has been cleared and isCVT has been reset
                    if (AppendTriple(builder, parts, namesTable))
                        isCVT = false;

                    #region Update status information for print out

                    avgUncompressedSize = (avgUncompressedSize*currentMidCounts + totalRawBytes)/(currentMidCounts + 1.0);
                    avgCompressedSize = (avgCompressedSize*currentMidCounts + compressedBytesCount)/(currentMidCounts + 1.0);
                    totalRawBytes = 0;
                    currentMidCounts += 1.0;

                    if (((long)currentMidCounts)%10000 == 0)
                        Console.Write(".");

                    if (((long)currentMidCounts)%1000000 == 0)
                    {
                        Console.WriteLine();
                        Console.WriteLine("" + curLines + " lines, " + ((double)currentMidCounts)/1000000.0 + "Million mids, " + avgCompressedSize + " compAvg, " + avgUncompressedSize + " uncompAvg" +
                                          ", size read = " + avgUncompressedSize*currentMidCounts/(1024*1024*1024) + " GB");
                        GC.Collect();
                    }

                    #endregion
                }
            }
            catch (Exception e)
            {
                // Stop at the first failing line; everything indexed so far is still serialized below.
                Console.WriteLine(e);
                break;
            }
        }
    }

    using (var dictionaryStream = File.OpenWrite(midToOffsetFilename))
    {
        Serialize(midToOffsetDictionaries, dictionaryStream);
    }

    using (var largeDictionaryStream = File.OpenWrite(largeMidToOffsetFilename))
    {
        SerializeSimple(largeMidsToCompressedBlobsLocations, largeDictionaryStream);
    }

    using (var cvtNodesStream = File.OpenWrite(cvtNodesFilename))
    {
        SerializeCVTNodes(cvtNodes, cvtNodesStream);
    }

    using (var namesTableStream = File.OpenWrite(namesTableFilename))
    {
        SerializeRelationTable(namesTable, namesTableStream);
    }
}

/// <summary>
/// Appends "predicate TAB object" plus a line break for one triple to <paramref name="builder"/>
/// and, when the predicate is type.object.name, records the object as the subject's name.
/// </summary>
/// <param name="builder">Text buffer of the current subject group.</param>
/// <param name="parts">The split input line: [0] subject, [1] predicate, [2] object.</param>
/// <param name="namesTable">Subject-to-name table that is updated for name triples.</param>
/// <returns>True when the triple is a type.object.name triple.</returns>
private static bool AppendTriple(StringBuilder builder, string[] parts, Dictionary<string, string> namesTable)
{
    builder.Append(parts[1]);
    builder.Append("\t");
    builder.Append(parts[2]);
    builder.AppendLine();

    if (parts[1] != "type.object.name")
        return false;

    // SY: if an entity has more than one name, this table will only store the last one.
    namesTable[parts[0]] = parts[2];
    return true;
}
313 |
314 |
/// <summary>
/// Drains <paramref name="builder"/>, gzip-compresses its text and appends the compressed
/// bytes to <paramref name="datapagesWriter"/>.
/// </summary>
/// <param name="builder">Pending text; it is cleared by this call.</param>
/// <param name="datapagesWriter">Destination for the compressed bytes.</param>
/// <param name="rawBytes">Receives the uncompressed size in bytes (two bytes per char).</param>
/// <returns>The number of compressed bytes written.</returns>
public static int CompressAndSave(StringBuilder builder, BinaryWriter datapagesWriter, out long rawBytes)
{
    var text = builder.ToString();
    builder.Clear();

    // Raw in-memory copy of the chars, i.e. UTF-16 code units in the machine's byte order.
    var uncompressed = new byte[text.Length*sizeof (char)];
    Buffer.BlockCopy(text.ToCharArray(), 0, uncompressed, 0, uncompressed.Length);

    byte[] packed;
    using (var sink = new MemoryStream())
    {
        // The gzip stream has to be closed before reading the buffer so the trailer is flushed.
        using (var gzip = new GZipStream(sink, CompressionMode.Compress, false))
        {
            gzip.Write(uncompressed, 0, uncompressed.Length);
        }
        // MemoryStream.ToArray still works after the stream has been closed.
        packed = sink.ToArray();
    }

    datapagesWriter.Write(packed);
    rawBytes = uncompressed.Length;
    return packed.Length;
}
336 |
/// <summary>
/// Returns the bucket key of a subject: the first 4 characters when the subject starts with
/// "m." or "g.", otherwise the first 2 characters. A subject shorter than that is returned whole.
/// </summary>
/// <param name="subject">Subject identifier; must not be null.</param>
/// <returns>The key prefix; an empty string for an empty subject.</returns>
public static string GetSubjectKey(string subject)
{
    // Identifiers are compared ordinally: culture-sensitive matching is slower and can treat
    // ignorable characters as part of a prefix match.
    bool isMid = subject.StartsWith("m.", StringComparison.Ordinal) ||
                 subject.StartsWith("g.", StringComparison.Ordinal);
    int keyLength = isMid ? 4 : 2;

    // Math.Min covers short subjects, including the empty string, without throwing.
    return subject.Substring(0, Math.Min(keyLength, subject.Length));
}
350 |
/// <summary>
/// Writes the CVT table to <paramref name="stream"/> in BinaryWriter format: the number of
/// keys, then for each key its string, the number of subjects, and each subject string
/// followed by its boolean flag.
/// </summary>
/// <param name="cvtnodes">key -> { subject -> isCVT }.</param>
/// <param name="stream">Destination stream; it is flushed but left open for the caller.</param>
public static void SerializeCVTNodes(Dictionary<string, Dictionary<string, bool>> cvtnodes, Stream stream)
{
    // The writer is intentionally not disposed: disposing it would close the caller's stream.
    var output = new BinaryWriter(stream);

    output.Write(cvtnodes.Count);
    foreach (var bucket in cvtnodes)
    {
        output.Write(bucket.Key);
        output.Write(bucket.Value.Count);
        foreach (var node in bucket.Value)
        {
            output.Write(node.Key);
            output.Write(node.Value);
        }
    }

    output.Flush();
}
370 |
/// <summary>
/// Writes the offset index to <paramref name="stream"/> in BinaryWriter format: the number of
/// keys, then for each key its string, the number of subjects, and for each subject its
/// string, the 64-bit start offset and the 32-bit length.
/// </summary>
/// <param name="dictionaries">key -> { subject -> (start offset, length) }.</param>
/// <param name="stream">Destination stream; it is flushed but left open for the caller.</param>
public static void Serialize(Dictionary<string, Dictionary<string, Tuple<long, int>>> dictionaries, Stream stream)
{
    // The writer is intentionally not disposed: disposing it would close the caller's stream.
    var output = new BinaryWriter(stream);

    output.Write(dictionaries.Count);
    foreach (var bucket in dictionaries)
    {
        output.Write(bucket.Key);
        output.Write(bucket.Value.Count);
        foreach (var entry in bucket.Value)
        {
            Tuple<long, int> location = entry.Value;
            output.Write(entry.Key);
            output.Write(location.Item1);
            output.Write(location.Item2);
        }
    }

    output.Flush();
}
390 |
/// <summary>
/// Writes the table of split (large) subject groups to <paramref name="stream"/> in
/// BinaryWriter format: the number of subjects, then for each subject its string, the 64-bit
/// start offset, the number of parts and the 32-bit compressed size of each part.
/// </summary>
/// <param name="dictionary">subject -> (start offset, compressed size of each part).</param>
/// <param name="stream">Destination stream; it is flushed but left open for the caller.</param>
public static void SerializeSimple(Dictionary<string, Tuple<long, List<int>>> dictionary, Stream stream)
{
    // The writer is intentionally not disposed: disposing it would close the caller's stream.
    var output = new BinaryWriter(stream);

    output.Write(dictionary.Count);
    foreach (var entry in dictionary)
    {
        long startOffset = entry.Value.Item1;
        List<int> partSizes = entry.Value.Item2;

        output.Write(entry.Key);
        output.Write(startOffset);
        output.Write(partSizes.Count);
        for (int i = 0; i < partSizes.Count; i++)
        {
            output.Write(partSizes[i]);
        }
    }

    output.Flush();
}
410 |
/// <summary>
/// Writes a string-to-string table to <paramref name="stream"/> in BinaryWriter format: the
/// number of entries followed by each key string and value string.
/// </summary>
/// <param name="dictionary">Table to write.</param>
/// <param name="stream">Destination stream; it is flushed but left open for the caller.</param>
public static void SerializeRelationTable(Dictionary<string, string> dictionary, Stream stream)
{
    // The writer is intentionally not disposed: disposing it would close the caller's stream.
    var output = new BinaryWriter(stream);

    output.Write(dictionary.Count);
    foreach (var entry in dictionary)
    {
        output.Write(entry.Key);
        output.Write(entry.Value);
    }

    output.Flush();
}
422 |
/// <summary>
/// Filters the raw dump <c>freebase-rdf-latest</c> in <c>cmd.idir</c> into the simplified,
/// tab-separated triple files used by BuildStore, written to <c>cmd.odir</c>:
/// fb_en.txt (subjects starting with "m."), fb_en_nonM.txt (all other subjects) and
/// fb_console.txt (statistics about the scanned data).
/// </summary>
/// <remarks>
/// Each input line must have exactly four tab-separated fields; other lines are only counted.
/// Subject, predicate and non-literal objects are reduced to the text between the last '/' and
/// the closing '>'. Literal objects keep only the text between the quotes; literals tagged with
/// a language that does not start with "en" are dropped, except type.object.name of "m."
/// subjects. For every "m." subject a single name is written, picked by the preference order
/// en, en-US, en-GB, en-CA, en-Dsrt; when none of these exist, all distinct names are written.
/// Any exception aborts the scan and is reported on the console.
/// </remarks>
/// <param name="cmd">Parsed command line; only <c>idir</c> and <c>odir</c> are read here.</param>
public static void TrimData(CommandLineArguments cmd)
{
    var inputFBFile = Path.Combine(cmd.idir, "freebase-rdf-latest");
    var outputEnglishFile = Path.Combine(cmd.odir, "fb_en.txt");
    var outputEnglishNonMFile = Path.Combine(cmd.odir, "fb_en_nonM.txt");
    var outputConsoleFilename = Path.Combine(cmd.odir, "fb_console.txt");

    // All counters are 64-bit: a full dump can have more lines than fit in an int, and
    // Enumerable.Sum over int values throws on overflow.
    long totalTriples = 0;
    long moreThan4PartsCount = 0;

    var subjectFirstTwoLettersHistogram = new Dictionary<string, long>();
    var beforePredicateHistogram = new Dictionary<string, long>();
    var objectIdsPrefixHistogram = new Dictionary<string, long>();

    long objectValueCount = 0;
    long objectValueNoLangIdCount = 0;
    long objectValueEnglishLangIdCount = 0;
    long objectValueForeignLangIdCount = 0;

    // predicates to be removed
    var hsRemovedPred = new HashSet<string>(new[]
    {
        "22-rdf-syntax-ns#type", "type.object.key", "rdf-schema#label", "type.object.permission",
        "type.user.usergroup", "type.usergroup.member", "type.user.userid", "type.permission.controls", "user"
    });
    string[] lstRemovedPredPre = {"user.", "wikipedia.", "dataworld."};

    // Store the entity names even when there is no "en" language
    var dtLang2Name = new Dictionary<string, string>();
    string[] langOrder = {"en", "en-US", "en-GB", "en-CA", "en-Dsrt"};
    string lastSubj = "";

    try
    {
        using (FileStream originalFileStream = File.OpenRead(inputFBFile))
        using (var reader = new StreamReader(originalFileStream))
        using (var writerDecompressedEnglish = new StreamWriter(outputEnglishFile))
        using (var writernonM = new StreamWriter(outputEnglishNonMFile))
        using (var writerConsole = new StreamWriter(outputConsoleFilename))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (totalTriples%100000 == 0)
                    Console.Write(".");
                if (totalTriples%10000000 == 0)
                    Console.WriteLine();
                totalTriples++;
                var parts = line.Split('\t');

                if (parts.Length != 4)
                {
                    moreThan4PartsCount++;
                    continue;
                }

                // Language tag of this line's object. It is reset for every line so that a
                // literal without a tag is never filed under the tag of an earlier line.
                string lang = "";

                // Subject: the text between the last '/' and the closing '>'.
                var subjectStart = parts[0].LastIndexOf('/') + 1;
                var subjectEnd = parts[0].LastIndexOf('>');
                var subject = parts[0].Substring(subjectStart, subjectEnd - subjectStart);
                var subjectPrefix = subject.Substring(0, (subject.Length > 1 ? 2 : 1));
                BumpHistogram(subjectFirstTwoLettersHistogram, subjectPrefix);

                // Predicate: the text between the last '/' and the closing '>'.
                var predicateStart = parts[1].LastIndexOf('/') + 1;
                var predicateEnd = parts[1].LastIndexOf('>');
                var predicate = parts[1].Substring(predicateStart, predicateEnd - predicateStart);

                // Check if we want to remove this predicate
                if (hsRemovedPred.Contains(predicate))
                    continue;
                if (lstRemovedPredPre.Any(prefix => predicate.StartsWith(prefix, StringComparison.Ordinal)))
                    continue;

                var beforePredicate = parts[1].Substring(0, predicateStart);
                BumpHistogram(beforePredicateHistogram, beforePredicate);

                string objectG;
                if (parts[2][0] != '"') // mid or predicate
                {
                    // Non-literal object: keep the text between the last '/' and the closing '>'.
                    var objectStart = parts[2].LastIndexOf('/') + 1;
                    if (parts[2][objectStart] == '>')
                    {
                        // The token ends with "/>": step back to the previous '/' so that the
                        // last path segment (including its trailing '/') is kept.
                        objectStart = parts[2].Substring(0, objectStart - 1).LastIndexOf('/') + 1;
                    }
                    var objectEnd = parts[2].LastIndexOf('>');
                    objectG = parts[2].Substring(objectStart, objectEnd - objectStart);

                    var objectPrefix = objectG.Substring(0, (objectG.Length > 1 ? 2 : 1));
                    BumpHistogram(objectIdsPrefixHistogram, objectPrefix);
                }
                else // value
                {
                    // Literal object, for example:
                    //   "9"
                    //   "Laurens Maturana"@en
                    //   "1922-06-12"^^ followed by a datatype
                    // Only the text between the first and the last quotation mark is kept.
                    var lastQuotationIndex = parts[2].LastIndexOf('"');
                    objectG = parts[2].Substring(1, lastQuotationIndex - 1);
                    if (parts[2].Length > lastQuotationIndex + 1 && parts[2][lastQuotationIndex + 1] == '@')
                    {
                        lang = parts[2].Substring(lastQuotationIndex + 2);

                        if (parts[2].Length > lastQuotationIndex + 3 && lang.StartsWith("en", StringComparison.Ordinal))
                        {
                            objectValueEnglishLangIdCount++;
                        }
                        else
                        {
                            objectValueForeignLangIdCount++;
                            if (predicate != "type.object.name" || !subject.StartsWith("m.", StringComparison.Ordinal))
                                continue; // filter out foreign stuff, except entity names
                        }
                    }
                    else
                    {
                        objectValueNoLangIdCount++;
                    }
                    objectValueCount++;
                }

                // Before moving to the next subject, output the name of the entity
                if (subject != lastSubj)
                {
                    if (lastSubj != "")
                        WriteBestEntityName(writerDecompressedEnglish, lastSubj, dtLang2Name, langOrder);

                    // reset variables
                    lastSubj = subject;
                    dtLang2Name = new Dictionary<string, string>();
                }

                if (subject.StartsWith("m.", StringComparison.Ordinal))
                {
                    if (predicate == "type.object.name") // Store first and output in the end
                        dtLang2Name[lang] = objectG;
                    else
                        writerDecompressedEnglish.WriteLine(subject + "\t" + predicate + "\t" + objectG);
                }
                else
                {
                    writernonM.WriteLine(subject + "\t" + predicate + "\t" + objectG);
                }
            }

            // Final processing of the name of the entity (writes nothing when no name is pending)
            WriteBestEntityName(writerDecompressedEnglish, lastSubj, dtLang2Name, langOrder);

            writerConsole.WriteLine("Total Triples: " + totalTriples);

            writerConsole.WriteLine("Num parts != 4: " + moreThan4PartsCount);

            writerConsole.WriteLine("Subject: ");
            DumpHistogram(writerConsole, subjectFirstTwoLettersHistogram);

            writerConsole.WriteLine("Before Predicate");
            DumpHistogram(writerConsole, beforePredicateHistogram);

            writerConsole.WriteLine("Object Ids");
            DumpHistogram(writerConsole, objectIdsPrefixHistogram);

            var objectsWithIdsCount = objectIdsPrefixHistogram.Values.Sum();
            writerConsole.WriteLine("Objects With Ids: " + objectsWithIdsCount);
            writerConsole.WriteLine("Objects With Values: " + objectValueCount);
            writerConsole.WriteLine(" No Lang Id: " + objectValueNoLangIdCount);
            writerConsole.WriteLine(" English: " + objectValueEnglishLangIdCount);
            writerConsole.WriteLine(" Foreign: " + objectValueForeignLangIdCount);
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e.Message + Environment.NewLine + (e.InnerException ?? new Exception("")).Message);
    }
}

/// <summary>
/// Writes the type.object.name line(s) of one subject: the name in the first language of
/// <paramref name="langOrder"/> that is present, or all distinct names when none is present.
/// </summary>
private static void WriteBestEntityName(TextWriter writer, string subject, Dictionary<string, string> namesByLang, string[] langOrder)
{
    foreach (var preferredLang in langOrder)
    {
        string preferredName;
        if (namesByLang.TryGetValue(preferredLang, out preferredName))
        {
            writer.WriteLine(subject + "\ttype.object.name\t" + preferredName);
            return;
        }
    }

    // No preferred language available: write all names
    foreach (var anyName in namesByLang.Values.Distinct())
        writer.WriteLine(subject + "\ttype.object.name\t" + anyName);
}

/// <summary>Increments the count stored for <paramref name="key"/>, starting at 1 for a new key.</summary>
private static void BumpHistogram(Dictionary<string, long> histogram, string key)
{
    long count;
    histogram.TryGetValue(key, out count); // leaves count at 0 when the key is new
    histogram[key] = count + 1;
}

/// <summary>Writes the histogram entries with a count above 1000, most frequent first.</summary>
private static void DumpHistogram(TextWriter writer, Dictionary<string, long> histogram)
{
    foreach (var entry in histogram.Where(e => e.Value > 1000).OrderByDescending(e => e.Value))
        writer.WriteLine("\t" + entry.Value + ": " + entry.Key);
}
674 |
/// <summary>
/// Writes to ghost_mid.txt (in <c>cmd.odir</c>) every object identifier starting with "m." or
/// "g." that has no entry in the names table and never occurs as a subject in fb_en.txt or
/// fb_en_nonM.txt. The names table and both triple files are read from <c>cmd.idir</c>.
/// </summary>
/// <param name="cmd">Parsed command line; only <c>idir</c> and <c>odir</c> are read here.</param>
public static void FindGhost(CommandLineArguments cmd)
{
    var fnNameTable = Path.Combine(cmd.idir, "namesTable.bin");
    var fnTupleFile1 = Path.Combine(cmd.idir, "fb_en.txt");
    var fnTupleFile2 = Path.Combine(cmd.idir, "fb_en_nonM.txt");
    var fnGhost = Path.Combine(cmd.odir, "ghost_mid.txt");

    var setObj = new HashSet<string>();
    var setSub = new HashSet<string>();
    long lnCnt = 0;

    // Dispose the file handle as soon as the table has been loaded.
    Dictionary<string, string> namesTable;
    using (var namesStream = File.OpenRead(fnNameTable))
    {
        namesTable = DeserializeRelationTable(namesStream);
    }

    var startTime = DateTime.Now;

    foreach (var ln in File.ReadLines(fnTupleFile1).Concat(File.ReadLines(fnTupleFile2)))
    {
        if (++lnCnt % 1000000 == 0)
        {
            var retrieveSec = (DateTime.Now - startTime).TotalSeconds;
            Console.Error.WriteLine("[{0:0.00}] Processed {1}M lines.", retrieveSec, lnCnt / 1000000);
        }
        var f = ln.Split('\t');
        string sub = f[0], obj = f[2];

        if ((obj.StartsWith("m.", StringComparison.Ordinal) || obj.StartsWith("g.", StringComparison.Ordinal)) && // not a value node
            (!namesTable.ContainsKey(obj))) // not an entity node, a candidate ghost object
            setObj.Add(obj);

        if (!namesTable.ContainsKey(sub)) // not an entity node
            setSub.Add(sub);
    }

    // Candidates that also occur as a subject have data of their own, so they are not ghosts.
    using (var fGhost = new StreamWriter(fnGhost))
    {
        foreach (var x in setObj.Except(setSub))
        {
            fGhost.WriteLine("{0}", x);
        }
    }
}
713 |
/// <summary>
/// Reads a string-to-string table in BinaryReader format: a 32-bit entry count followed by
/// that many key/value string pairs. A key that occurs more than once keeps its last value.
/// </summary>
/// <param name="stream">Source stream; it is left open for the caller.</param>
/// <returns>The table that was read.</returns>
private static Dictionary<string, string> DeserializeRelationTable(Stream stream)
{
    // The reader is intentionally not disposed: disposing it would close the caller's stream.
    var input = new BinaryReader(stream);

    int entryCount = input.ReadInt32();
    var table = new Dictionary<string, string>(entryCount);

    int remaining = entryCount;
    while (remaining > 0)
    {
        string name = input.ReadString();
        string mapped = input.ReadString();
        table[name] = mapped;
        remaining--;
    }

    return table;
}
727 | }
728 | }
--------------------------------------------------------------------------------
/FreebaseToRDFStore/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General Information about an assembly is controlled through the following
6 | // set of attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyTitle("FreebaseToRDFStore")]
9 | [assembly: AssemblyDescription("")]
10 | [assembly: AssemblyConfiguration("")]
11 | [assembly: AssemblyCompany("")]
12 | [assembly: AssemblyProduct("FreebaseToRDFStore")]
13 | [assembly: AssemblyCopyright("Copyright © 2015")]
14 | [assembly: AssemblyTrademark("")]
15 | [assembly: AssemblyCulture("")]
16 |
17 | // Setting ComVisible to false makes the types in this assembly not visible
18 | // to COM components. If you need to access a type in this assembly from
19 | // COM, set the ComVisible attribute to true on that type.
20 | [assembly: ComVisible(false)]
21 |
22 | // The following GUID is for the ID of the typelib if this project is exposed to COM
23 | [assembly: Guid("0f2e5483-99ef-4d7a-a4bd-aad7573084d5")]
24 |
25 | // Version information for an assembly consists of the following four values:
26 | //
27 | // Major Version
28 | // Minor Version
29 | // Build Number
30 | // Revision
31 | //
32 | // You can specify all the values or you can default the Build and Revision Numbers
33 | // by using the '*' as shown below:
34 | // [assembly: AssemblyVersion("1.0.*")]
35 | [assembly: AssemblyVersion("1.0.0.0")]
36 | [assembly: AssemblyFileVersion("1.0.0.0")]
37 |
--------------------------------------------------------------------------------
/FreebaseToRDFStore/app.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/FreebaseToRDFStore/packages.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/LICENSE.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FastRDFStore/42c5ae4a9ed5b18bdcddb3ada448f254eac579da/LICENSE.docx
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | **Microsoft Research License Agreement for MSR FastRDFStore**
2 |
3 | This Microsoft Research License Agreement ("Agreement") is a legal agreement between you and Microsoft Corporation ("Microsoft"). Please read it and all of its terms and conditions. They apply to the Microsoft Research Deliverables which may include source code, object code, data, and any associated materials, text or speech files, associated media and "online" or electronic documentation, and any updates we provide in our discretion (together, the "Deliverables"). This Agreement also applies to any Microsoft (i) updates, (ii) supplements, (iii) internet-based services, and (iv) support services for this Deliverables, unless other terms accompany those items. If so, those terms supplement this Agreement and apply to the extent they are in conflict with this Agreement.
4 |
5 | By agreeing to this Agreement and/or by using the Deliverables, you accept these terms and conditions. If you do not accept them, do not use the Deliverables. If you comply with these license terms and conditions, you have the rights described below.
6 |
7 | 1. **SCOPE OF RIGHTS.**
8 |
9 | **License Grant.** Subject to the terms of this Agreement, you have the following rights to the Deliverables for non-commercial, research only purposes:
10 | 1. **Source Code**: You may use, copy, modify, and distribute the source code.
11 | 2. **Object Code**: You may use, copy, and distribute the object code.
12 | 3. **Restrictions.** You may not (i) alter any copyright, trademark or patent notice in the Deliverables; (ii) use Microsoft's trademarks in a way that suggests your derivative works or modifications come from or are endorsed by Microsoft; or (iii) include the Deliverables in malicious, deceptive or unlawful programs.
13 | 4. **Requirements.** (i) You may only distribute the Deliverables, or any derivative works of the Deliverables as part of, and only for use with, your non-commercial offering licensed under this Agreement, and you are not authorized to distribute them under any terms and conditions that are broader than, conflict with or are different from those provided by this Agreement, and (ii) If you have created derivative works of the Deliverables, and distribute such derivative works, you will cause the modifications to carry prominent notices so that recipients know that they are not receiving the original Deliverables. Such notices must state: (i) that you have changed the Deliverables; (ii) what portions have changed, and (iii) the date of any changes.
14 |
15 | 2. **RESERVATION OF RIGHTS.** The Deliverables are licensed, not sold. This Agreement only gives you some rights to use the Deliverables with respect to the intellectual property Microsoft owns in the Deliverables ("Microsoft IP") and your rights are conditioned on you not receiving any license or other rights in any intellectual property other than the Microsoft IP, even if such license or rights are necessary for you to use the Deliverables. Microsoft reserves all other rights. The license from Microsoft under this Agreement only applies to the Deliverables as provided by Microsoft, not to any modifications or derivative works you make. In using the Deliverables, you must comply with any technical limitations in the Deliverables that may only allow you to use it in certain ways. You may not:
16 |
17 | 1. a.Work around any technical limitations in the Deliverables;
18 | 2. b.Reverse engineer, decompile or disassemble the Deliverables, except and only to the extent that applicable law expressly permits, despite this limitation;
19 | 3. c.Use the Deliverables for commercial software hosting services or other commercial purposes;
20 | 4. d.Make more copies of the Deliverables than specified in this Agreement or allowed by applicable law, despite this limitation; or
21 | 5. e.Rent, lease or lend the Deliverables.
22 |
23 | 3. **License to Microsoft.** In the event you provide Microsoft with modifications or derivatives of the Deliverables, you hereby grant Microsoft, without any restrictions or limitations, a non-exclusive, perpetual, irrevocable, royalty-free, assignable and sub-licensable license, to reproduce, publicly perform or display, install, use, modify, post, distribute, make and have made, sell and transfer such contributions, modifications and derivatives for any purpose.
24 | 4. **FEEDBACK.** Any feedback about the Deliverables provided by you to us is voluntarily given, and Microsoft shall be free to commercialize and use the feedback as it sees fit without obligation or restriction of any kind, even if the feedback is designated by you as confidential. Such feedback shall be considered a contribution and licensed to Microsoft under the terms of Section 4 above.
25 | 5. **NO SUPPORT.** Microsoft is under no obligation to provide any support or additional materials for the Deliverables. Nor is Microsoft obligated to update or use the Deliverables.
26 | 6. **TERM; TERMINATION.** The term of this Agreement will commence upon your acceptance of these license terms and conditions and will continue indefinitely unless terminated earlier as provided herein. If you breach this Agreement or if you sue Microsoft or any other party over intellectual property that you think may apply to or read on the Deliverables or anyone's use of the Deliverables, this Agreement (and your license and rights obtained herein) terminate automatically. If this Agreement expires or is terminated, you must cease all activities related to the Deliverables and any derivative works or modifications and return or certify destruction of the Deliverables and all copies.
27 | 7. **EXPORT RESTRICTIONS.** The Deliverables are subject to United States export laws and regulations. You must comply with all domestic and international export laws and regulations that apply to the Deliverables. These laws include restrictions on destinations, end users and end use. For additional information, see [www.microsoft.com/exporting](http://www.microsoft.com/exporting).
28 | 8. **ENTIRE AGREEMENT.** This Agreement, any exhibits, and the terms for any supplements, updates, Internet-based services or support services that you use, are the entire agreement for the Deliverables and support services.
29 | 9. **SEVERABILITY.** If any court of competent jurisdiction determines that any provision of this Agreement is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect provided the intent of the parties can be preserved.
30 | 10. **Governing Law and Venue.** This Agreement is governed by and construed in accordance with the laws of the state of Washington, without reference to its choice of law principles to the contrary. Each party hereby consents to the jurisdiction and venue of the state and federal courts located in King County, Washington, with regard to any suit or claim arising under or by reason of this Agreement.
31 | 11. **LEGAL EFFECT.** This Agreement describes certain legal rights. You may have other rights under the laws of your country. This Agreement does not change your rights under the laws of your country if the laws of your country do not permit it to do so.
32 | 12. **NO ASSIGNMENT.** You may not assign this Agreement or any rights or obligations hereunder, except with Microsoft's express written consent. Any attempted assignment in violation of this section will be void.
33 | 13. **DISCLAIMER OF WARRANTY; LIMITATION OF LIABILITY. The DELIVERABLES ARE PROVIDED AND licensed "as-is." You bear the risk of using THEM. Microsoft gives no express warranties, guarantees or conditions. To the MAXIMUM extent permitted under law, Microsoft excludes ALL WARRANTIES INCLUDING the implied warranties of merchantability, fitness for a particular purpose and non-infringement. IN NO EVENT SHALL MICROSOFT BE LIABLE FOR ANY INDIRECT, INCIDENTAL, SPECIAL OR CONSEQUENTIAL DAMAGES, INCLUDING THE LOSS OF REVENUE, DATA OR USE OR THE COST OF PROCUREMENT OF SUBSTITUTE SERVICES, INCURRED OR SUFFERED BY YOU OR ANY THIRD PARTY IN CONNECTION WITH THIS AGREEMENT, WHETHER IN AN ACTION IN CONTRACT, TORT, BASED ON A WARRANTY OR OTHERWISE, EVEN IF MICROSOFT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT SHALL MICROSOFT'S AGGREGATE LIABILITY UNDER THIS AGREEMENT EXCEED FIVE THOUSAND U.S. DOLLARS (US $5,000). THE PARTIES ACKNOWLEDGE AND AGREE THAT THIS DISCLAIMER AND THE LIMITATION OF LIABILITY ARE FUNDAMENTAL PARTS OF THIS AGREEMENT AND MICROSOFT WOULD NOT AGREE TO ENTER INTO THIS AGREEMENT WITHOUT SUCH DISCLAIMER AND LIMITATION.**
34 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MSR FastRDFStore Package
2 | -----
3 |
4 | ## Overview
5 |
6 | The MSR FastRDFStore Package is designed for creating an in-memory index of RDF triples, implemented as a WCF service in C#, and consists of server & client side code. RDF triples are the standard format for storing structured knowledge graphs. Instead of relying on a complete SPARQL server engine to index and serve the data from RDF triples, our software package provides the essential functions for traversing the knowledge graph in a much more efficient way.
7 |
8 | In addition to the binary executables and the source code, the package includes the last dump of Freebase ([freebase-rdf-2015-08-09-00-01.gz](https://developers.google.com/freebase/)), as well as the processed version ready to load directly into FastRDFStore. The data release needs to be downloaded separately from Microsoft Download Center ([MSR FastRDFStore Package - Data Release](https://www.microsoft.com/en-us/download/details.aspx?id=54511)). Users who would like to use the package for Freebase do not need to compile the package and process the raw data, but instead can run the executables directly. The executables can be directly run on Windows, or on Linux using [Mono](http://www.mono-project.com/ "Cross platform, open source .NET framework").
9 |
10 | FastRDFStore was originally designed to support the creation of the [WebQuestions Semantic Parses Dataset (WebQSP)](https://www.aka.ms/WebQSP "WebQuestions Semantic Parses Dataset"). Details on this dataset can be found in our ACL 2016 paper: Yih, Richardson, Meek, Chang & Suh. "[The Value of Semantic Parse Labeling for Knowledge Base Question Answering](https://aclweb.org/anthology/P/P16/P16-2033.pdf)."
11 |
12 | ## Run FastRDFStore on Freebase
13 |
14 | If you just need to run the FastRDFStore WCF server on the Freebase data provided in this package, simply use the following command to start the FastRDFStore server.
15 |
16 | * ```bin\FastRDFStore.exe -i data```
17 |
18 | Note that running the FastRDFStore service on this Freebase data requires about 50 GB of memory, and initializing the server takes about 14 minutes. Once the service starts, you can use the command line client tool to test it.
19 |
20 | * ```bin\FastRDFStoreClient.exe```
21 |
22 | When you type a Freebase entity ID (i.e., a MID), the client outputs the triples in which the given MID is the subject. When the object is a CVT node, it also outputs the triples that have the CVT node as their subject. Below is an example:
23 |
24 | ```
25 | Enter subject: m.0c5g7w5
26 | common.topic.notable_for --> CVT (g.1yg9b9lpq)
27 | common.notable_for.predicate --> /type/object/type
28 | common.notable_for.display_name --> Musical Track
29 | --> Musical Recording
30 | common.notable_for.object --> Musical Recording (m.0kpv11)
31 | common.notable_for.notable_object --> Musical Recording (m.0kpv11)
32 | base.schemastaging.topic_extra.review_webpage --> Round_%2526_Round_(Selena_Gomez_%2526_the_Scene_song)
33 | music.recording.contributions --> CVT (m.0ccbt6k)
34 | music.track_contribution.track --> Round & Round (m.0c5g7w5)
35 | music.track_contribution.contributor --> Selena Gomez (m.0gs6vr)
36 | common.topic.notable_types --> Musical Recording (m.0kpv11)
37 | music.recording.producer --> Kevin Rudolf (m.03f5drm)
38 | music.recording.length --> 308.0
39 | common.topic.webpage --> CVT (m.0ccbrdk)
40 | common.webpage.resource --> Wikipedia (m.0ccbrdf)
41 | common.webpage.category --> Review (m.09rg1d4)
42 | common.webpage.topic --> Round & Round (m.0c5g7w5)
43 | kg.object_profile.prominent_type --> Musical Track (music.recording)
44 | common.topic.article --> CVT (m.0ccbrm8)
45 | common.document.updated --> 2010-07-08T20:12:00.330017Z
46 | common.document.text --> \"Round & Round\" is a song by American band Selena Gomez & the Scene. The song was written by Selena Gomez, Fefe Dobson, and Cash Money's Kevin Rudolf, who also produced the song. The song is an electronica-based dance-pop song with rock and disco beats. It was released as the lead single from the band's sophomore album, A Year Without Rain on June 22, 2010.
47 | common.document.content --> type.object.name --> Round & Round
48 | Took 0.071488 seconds to retrieve results
49 | ```
50 |
51 | ## Details of Projects
52 |
53 | Below, we provide more detailed descriptions of the projects, data and other folders included in this package.
54 |
55 | ### FastRDFStore
56 |
57 | This is the RDFStore WCF service we provided. Available command line arguments are:
58 | ```
59 | bin\FastRDFStore.exe -h
60 | FastRDFStore.exe Usage:
61 |
62 |
63 | -i, --idir (Default: ) Directory containing *.bin files
64 | -s, --server (Default: localhost) Server [localhost]
65 | -p, --port (Default: 9358) Connect to the FastRDFStore server on this port
66 | -l, --log (Default: FastRDFStore.log) Log file. Set to empty to disable logging
67 | --help Display this help screen.
68 | ```
69 |
70 | Functions supported in this service are defined in the interface file ```IFastRDFStore.cs```:
71 |
72 | * ```string[] GetOutboundPredicates(string subjectMid);```
73 |
74 | Return all the predicates starting with *subjectMid*. If any predicate leads to a CVT node, then all outbound predicates from the CVT node are also followed. These predicates are represented by a space-delimited string "predicate1 predicate2", where predicate1 leads from the subjectMid to the CVT node, and predicate2 is a predicate off of the CVT node.
75 |
76 | * ```string[] GetEntityNames(string[] entMids);```
77 |
78 | Return the names of given entity ids (*entMids*). Names are determined using the "type.object.name" relation for the entity.
79 |
80 | * ```SimpleFBObject GetSimpleObjectPredicatesAndCVTs(string subjectMid, int maxPerPredicate, bool followCVT);```
81 |
82 | Returns a graph of predicates and objects reachable from the given subject. The SimpleFBObject contains all of the predicates of which subjectMid is a subject, and for each predicate contains a list of all objects reachable by following the predicate from the given subject. It will also follow CVT nodes for one hop, if requested. That is, if a predicate points to a CVT node, then all outgoing predicates from that node (and all corresponding objects) will also be returned. Note that the same object may be reachable through more than one predicate, and is deduplicated in the returned graph.
83 |
84 | * ```SimpleFBObject GetSimpleObjectFilteredPredicateAndObjects(string subjectMid, string predicate);```
85 |
86 | Similar to GetSimpleObjectPredicatesAndCVTs, this returns a graph containing predicates and objects reachable from the given subjectMid. In this case, it is filtered to objects reachable via the given predicate. The predicate may be a space-delimited string containing two predicates in order to walk through a CVT node. For example, "music.recording.contributions music.track_contribution.track".
87 |
88 | * ```string[][] FindNodeSquencesOnPredicateChain(string startMid, string[] chainPredicates);```
89 |
90 | Return the lists of intermediate nodes connected by the given chain of predicates (*chainPredicates*) starting from the node *startMid*.
91 |
92 | -----
93 |
94 | ### FastRDFStoreClient
95 |
96 | This command-line client tool is useful for querying the FastRDFStore service in either batch or interactive mode. Available command line arguments are:
97 |
98 | ```
99 | bin\FastRDFStoreClient.exe -h
100 | FastRDFStoreClient.exe Usage:
101 |
102 |
103 | -s, --server (Default: localhost) Connect to the FastRDFStore server on this server [localhost]
104 | -p, --port (Default: 9358) Connect to the FastRDFStore server on this port [9358]
105 | -d, --dump DumpMID
106 | -m, --mid MID to search for
107 | -t, --tripleOnly Triple Only
108 | --pred (optional) predicate for filtering
109 | -c, --chain Predicate chain to search for
110 | --help Display this help screen.
111 | ```
112 |
113 | When the MID is given, the code is in batch mode and dumps the results to standard output. This is useful when using a script to run FastRDFStore. Arguments --tripleOnly and --chain are only valid in batch mode; the former outputs only the triples with MID as the subject (without expanding the CVT triples) and the latter only outputs nodes on a given predicate chain.
114 |
115 | -----
116 |
117 | ### FreebaseToRDFStore
118 |
119 | This is the utility to process the raw Freebase dump into binary and text data files used by FastRDFStore. For instance, taking the Freebase dump *freebase-rdf-2015-08-09-00-01.gz* as the original input file, we need to run the following commands to generate the data files.
120 |
121 | ```zcat freebase-rdf-2015-08-09-00-01.gz > data/freebase-rdf-latest```
122 |
123 | ```
124 | # Preserve only the Freebase triples needed
125 | bin\FreebaseToRDFStore.exe -c TrimData -i data -o data
126 | ```
127 |
128 | ```
129 | # Build the compressed, binary RDF store files
130 | bin\FreebaseToRDFStore.exe -c BuildStore -i data -o data
131 | ```
132 |
133 | ```
134 | # Find ghost entity nodes that no subject nodes can link to
135 | bin\FreebaseToRDFStore.exe -c FindGhost -i data -o data
136 | ```
137 |
138 | Once you have run this sequence of commands, you can run the FastRDFStore server on the data directory, as outlined above.
139 |
140 | -----
141 |
142 | ### Notes on compiling using Mono
143 |
144 | When using Mono to compile FastRDFStore, the package CommandLineParser.1.9.71 needs to be installed first via NuGet.
145 | ```
146 | $ wget http://nuget.org/nuget.exe -P bin
147 | $ mono bin/nuget.exe install FastRDFStore/packages.config -OutputDirectory packages
148 | ```
149 | After that, you can then directly run *xbuild*.
150 | ```
151 | $ xbuild FastRDFStore.sln
152 | ```
153 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/SharedDataTypes/Config.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.IO;
3 |
4 | namespace CSI
5 | {
6 |     // Static configuration values: a working-directory root, data paths derived from it,
7 |     // three keyword sets, and two numeric constants. The code that consumes these values
8 |     // is not part of this file.
9 |     public static class Config
10 |     {
11 |         // NOTE(review): hard-coded UNC share; looks machine-specific -- confirm before reuse.
12 |         public const string dirWork = @"\\tspace10\e$\users\tmsnwork";
13 | 
14 |         // Derived paths. Static field initializers run in textual order, so dirDat must stay
15 |         // above fnStopWords, which reads it.
16 |         public static string dirDat = Path.Combine(dirWork, "Data");
17 |         public static string fnStopWords = Path.Combine(dirDat, "short-stopwords.txt");
18 | 
19 |         // Keyword sets. .NET only provides the generic HashSet<T>, so the element type is
20 |         // spelled out explicitly; a bare "HashSet" does not compile.
21 |         // Male family-relation terms.
22 |         public static HashSet<string> setMaleKW = new HashSet<string>(new string[] { "dad", "father", "brother", "grandfather", "grandson", "son", "husband" });
23 |         // Female family-relation terms.
24 |         public static HashSet<string> setFemaleKW = new HashSet<string>(new string[] { "mom", "mother", "sister", "grandmother", "granddaughter", "daughter", "wife" });
25 |         // Time/date-related terms.
26 |         public static HashSet<string> setTimeKW = new HashSet<string>(new string[] { "when", "time", "year", "date", "old", "birthdate", "birthday" });
27 | 
28 |         // Upper bound on entity candidates; where it is applied is not visible here.
29 |         public const int MaxEntityCandidates = 12;
30 | 
31 |         // Small constant, presumably a floating-point tolerance -- TODO confirm against callers.
32 |         public const double Epsilon = 1e-10;
33 |     }
34 | }
--------------------------------------------------------------------------------
/SharedDataTypes/FreebaseCommonTypes.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.Serialization;
3 |
4 | namespace FastRDFStore
5 | {
6 |     // The three node kinds that the FBObject subclasses below correspond to.
7 |     public enum FBNodeType
8 |     {
9 |         Value = 0,
10 |         Entity = 1,
11 |         CVT = 2
12 |     }
13 | 
14 |     // One predicate together with the array of objects listed under it.
15 |     [DataContract(IsReference = true)]
16 |     public class PredicateAndObjects
17 |     {
18 |         [DataMember]
19 |         public string Predicate { get; set; }
20 | 
21 |         [DataMember]
22 |         public FBObject[] Objects { get; set; }
23 |     }
24 | 
25 |     // Common base of every object in a returned graph. A concrete FBObject is one of:
26 |     //   - ValueFBObject:  a simple value, e.g. "42"             -> ValueFBObject(value="42")
27 |     //   - SimpleFBObject: an entity, e.g. Ireland               -> SimpleFBObject(mid="m.012wgb", name="Ireland")
28 |     //   - CVTFBObject:    a CVT node, e.g. a dated integer value
29 |     // The subclasses are registered as known types on this base class.
30 |     [DataContract(IsReference = true)]
31 |     [KnownType(typeof(ValueFBObject))]
32 |     [KnownType(typeof(SimpleFBObject))]
33 |     [KnownType(typeof(CVTFBObject))]
34 |     public abstract class FBObject
35 |     {
36 |         // Mid of the object; the base implementation returns the empty string.
37 |         public virtual string GetMid()
38 |         {
39 |             return string.Empty;
40 |         }
41 | 
42 |         // Display form of the object.
43 |         public abstract string PrettyString();
44 | 
45 |         // Name (for entities) or raw value (for values) of the object.
46 |         public abstract string GetNameOrValue();
47 |     }
48 | 
49 |     // A simple value. Both string accessors return Value unchanged; GetMid is inherited.
50 |     [DataContract(IsReference = true)]
51 |     public class ValueFBObject : FBObject
52 |     {
53 |         [DataMember]
54 |         public string Value { get; set; }
55 | 
56 |         public override string GetNameOrValue()
57 |         {
58 |             return Value;
59 |         }
60 | 
61 |         public override string PrettyString()
62 |         {
63 |             return Value;
64 |         }
65 |     }
66 | 
67 |     // An entity: a mid, a name, and optionally its outgoing predicates.
68 |     [DataContract(IsReference = true)]
69 |     public class SimpleFBObject : FBObject
70 |     {
71 |         [DataMember]
72 |         public string Mid { get; set; }
73 | 
74 |         [DataMember]
75 |         public string Name { get; set; }
76 | 
77 |         // Guaranteed that each predicate appears only once. May be null
78 |         [DataMember]
79 |         public PredicateAndObjects[] Objects { get; set; }
80 | 
81 |         public override string GetMid()
82 |         {
83 |             return Mid;
84 |         }
85 | 
86 |         public override string GetNameOrValue()
87 |         {
88 |             return Name;
89 |         }
90 | 
91 |         public override string PrettyString()
92 |         {
93 |             return Name;
94 |         }
95 |     }
96 | 
97 |     // A CVT node: a mid plus its outgoing predicates. It has no name, so
98 |     // GetNameOrValue returns "" and PrettyString renders as "[CVT <mid>]".
99 |     [DataContract(IsReference = true)]
100 |     public class CVTFBObject : FBObject
101 |     {
102 |         [DataMember]
103 |         public string Mid { get; set; }
104 | 
105 |         [DataMember]
106 |         public PredicateAndObjects[] Objects { get; set; }
107 | 
108 |         public override string GetMid()
109 |         {
110 |             return Mid;
111 |         }
112 | 
113 |         public override string GetNameOrValue()
114 |         {
115 |             return "";
116 |         }
117 | 
118 |         public override string PrettyString()
119 |         {
120 |             return string.Concat("[CVT ", Mid, "]");
121 |         }
122 |     }
123 | }
--------------------------------------------------------------------------------
/bin/Acknowledgment.md:
--------------------------------------------------------------------------------
1 | We use the [Command Line Parser Library](https://github.com/gsscoder/commandline) v.1.9.71 to parse the arguments,
2 | and include the library file CommandLine.dll in this folder.
3 | The [license of Command Line Parser Library](https://github.com/gsscoder/commandline/blob/master/License.md) is copied below:
4 |
5 | The MIT License (MIT)
6 |
7 | Copyright (c) 2005 - 2015 Giacomo Stelluti Scala & Contributors
8 |
9 | Permission is hereby granted, free of charge, to any person obtaining a copy
10 | of this software and associated documentation files (the "Software"), to deal
11 | in the Software without restriction, including without limitation the rights
12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 | copies of the Software, and to permit persons to whom the Software is
14 | furnished to do so, subject to the following conditions:
15 |
16 | The above copyright notice and this permission notice shall be included in
17 | all copies or substantial portions of the Software.
18 |
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 | THE SOFTWARE.
26 |
--------------------------------------------------------------------------------
/bin/CommandLine.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FastRDFStore/42c5ae4a9ed5b18bdcddb3ada448f254eac579da/bin/CommandLine.dll
--------------------------------------------------------------------------------
/bin/FastRDFStore.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FastRDFStore/42c5ae4a9ed5b18bdcddb3ada448f254eac579da/bin/FastRDFStore.exe
--------------------------------------------------------------------------------
/bin/FastRDFStoreClient.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FastRDFStore/42c5ae4a9ed5b18bdcddb3ada448f254eac579da/bin/FastRDFStoreClient.exe
--------------------------------------------------------------------------------
/bin/FreebaseToRDFStore.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/FastRDFStore/42c5ae4a9ed5b18bdcddb3ada448f254eac579da/bin/FreebaseToRDFStore.exe
--------------------------------------------------------------------------------
/bin/README.md:
--------------------------------------------------------------------------------
1 | The .exe executable files were compiled on Dec. 2, 2016. CommandLine.dll is from [Command Line Parser Library](https://github.com/gsscoder/commandline) 1.9.71 for CLR.
2 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | Please download the data from Microsoft Download Center ([MSR FastRDFStore Package - Data Release](https://www.microsoft.com/en-us/download/details.aspx?id=54511)).
2 |
--------------------------------------------------------------------------------