diff --git a/README b/README
--- a/README
+++ b/README
@@ -10,35 +10,21 @@
 ## Location

 Either:

-- /etc/softwareheritage/loader/svn.ini
-- ~/.config/swh/loader/svn.ini
-- ~/.swh/loader/svn.ini
+- /etc/softwareheritage/
+- ~/.config/swh/
+- ~/.swh/
+
+Note: that location is referred to below as $SWH_CONFIG_PATH

 ## Configuration sample

+$SWH_CONFIG_PATH/loader/svn.yml:
 ```
 storage:
   cls: remote
   args:
     url: http://localhost:5002/

-send_contents: true
-send_directories: true
-send_revisions: true
-send_releases: true
-send_occurrences: true
-# nb of max contents to send for storage
-content_packet_size: 10000
-# 100 Mib of content data
-content_packet_block_size_bytes: 104857600
-# limit for swh content storage for one blob (beyond that limit, the
-# content's data is not sent for storage)
-content_packet_size_bytes: 1073741824
-directory_packet_size: 2500
-revision_packet_size: 10
-release_packet_size: 1000
-occurrence_packet_size: 1000
-
 check_revision: 10
 ```
@@ -47,18 +33,44 @@
 With at least the following module (swh.loader.svn.tasks) and queue (swh_loader_svn):
-
+$SWH_CONFIG_PATH/worker.yml:
 ```
-[main]
-task_broker = amqp://guest@localhost//
-task_modules = swh.loader.svn.tasks
-task_queues = swh_loader_svn
+task_broker: amqp://guest@localhost//
+task_modules:
+ - swh.loader.svn.tasks
+task_queues:
+ - swh_loader_svn
-task_soft_time_limit = 0
+task_soft_time_limit: 0
 ```
-swh.loader.svn.tasks and swh_loader_svn are the important entries here.
+`swh.loader.svn.tasks` and `swh_loader_svn` are the important entries here.
+
+## Toplevel
+
+Loading a repository directly from a Python toplevel:
+
+```
+$ python3
+repo = 'pyang-repo-r343-eol-native-mixed-lf-crlf'
+#repo = 'zipeg-gae'
+origin_url = 'http://%s.googlecode.com' % repo
+local_repo_path = '/home/storage/svn/repo'
+svn_url = 'file://%s/%s' % (local_repo_path, repo)
+
+import logging
+logging.basicConfig(level=logging.DEBUG)
+
+from swh.loader.svn.tasks import LoadSvnRepository
+
+t = LoadSvnRepository()
+t.run(svn_url=svn_url,
+      destination_path='/tmp',
+      origin_url=origin_url, visit_date='2016-05-03T15:16:32+00:00',
+      start_from_scratch=True)
+```
+
+## Production-like

-## start worker instance
+### Start worker instance

 To start a current worker instance:
diff --git a/docs/comparison-git-svn-swh-svn.org b/docs/comparison-git-svn-swh-svn.org
--- a/docs/comparison-git-svn-swh-svn.org
+++ b/docs/comparison-git-svn-swh-svn.org
@@ -279,53 +279,53 @@
 CLOSED: [2016-05-12 Thu 14:54]
 #+BEGIN_SRC sh
- info | [2016-04-15 18:21:27,874: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[7d697a6b-ae8b-4718-b226-1406af717954]
+ info | [2016-04-15 18:21:27,874: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[7d697a6b-ae8b-4718-b226-1406af717954]
 debug | [2016-04-15 18:22:18,522: DEBUG/Worker-1] svn co svn://svn.debian.org/svn/pkg-fox/@1
 info | [2016-04-15 18:22:19,193: INFO/Worker-1] [revision_start-revision_end]: [1-145]
 info | [2016-04-15 18:22:19,207: INFO/Worker-1] Repo {'remote_url': 'svn://svn.debian.org/svn/pkg-fox', 'local_url': '/tmp/tmp.wzzvlwuw.swh.loader/pkg-fox', 'uuid': 'd908f651-7add-0310-a5d1-c7ac9dfebe41', 'swh-origin': 4} ready to be processed.
- info | [2016-04-15 18:45:33,703: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[7d697a6b-ae8b-4718-b226-1406af717954] succeeded in 1445.8084549359046s: None + info | [2016-04-15 18:45:33,703: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[7d697a6b-ae8b-4718-b226-1406af717954] succeeded in 1445.8084549359046s: None #+END_SRC *** DONE glibc-bsd CLOSED: [2016-05-12 Thu 14:54] #+BEGIN_SRC log - info | [2016-04-15 15:32:48,048: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[a41fba9b-f397-493a-a95f-deb673f91156] + info | [2016-04-15 15:32:48,048: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[a41fba9b-f397-493a-a95f-deb673f91156] info | [2016-04-15 15:32:59,607: INFO/Worker-1] [revision_start-revision_end]: [1-6006] info | [2016-04-15 15:32:59,620: INFO/Worker-1] Repo {'remote_url': 'svn://svn.debian.org/svn/glibc-bsd', 'local_url': '/tmp/tmp.bfeb_zdv.swh.loader/glibc-bsd', 'uuid': 'ae44cbe4-c7d5-0310-ae45-95c72a56cd7d', 'swh-origin': 3} ready to be processed. - info | [2016-04-15 18:21:27,855: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[a41fba9b-f397-493a-a95f-deb673f91156] succeeded in 10118.787201701081s: None + info | [2016-04-15 18:21:27,855: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[a41fba9b-f397-493a-a95f-deb673f91156] succeeded in 10118.787201701081s: None #+END_SRC *** DONE pkg-voip CLOSED: [2016-05-12 Thu 14:54] #+BEGIN_SRC sh - info | [2016-04-23 21:32:56,252: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[9a78bddb-227f-4f8a-b245-482a462e0000] + info | [2016-04-23 21:32:56,252: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[9a78bddb-227f-4f8a-b245-482a462e0000] debug | [2016-04-23 21:32:57,781: DEBUG/Worker-9] svn co svn://svn.debian.org/svn/pkg-voip/@1 - info | [2016-04-23 21:32:56,252: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[9a78bddb-227f-4f8a-b245-482a462e0000] + info | [2016-04-23 21:32:56,252: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[9a78bddb-227f-4f8a-b245-482a462e0000] info | [2016-04-23 21:32:58,221: INFO/Worker-9] Repo {'remote_url': 'svn://svn.debian.org/svn/pkg-voip', 'local_url': '/tmp/tmp.nwuhzku9.swh.loader/pkg-voip', 'uuid': '5e74be4b-f5d6-0310-a852-e9e23c5afa6a', 'swh-origin': 32} ready to be processed. 
info | [2016-04-23 21:32:58,186: INFO/Worker-9] [revision_start-revision_end]: [1-10707] - info | [2016-04-24 10:21:28,897: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[9a78bddb-227f-4f8a-b245-482a462e0000] succeeded in 46112.436119881924s: None + info | [2016-04-24 10:21:28,897: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[9a78bddb-227f-4f8a-b245-482a462e0000] succeeded in 46112.436119881924s: None #+END_SRC *** DONE python-modules CLOSED: [2016-05-12 Thu 14:54] #+BEGIN_SRC sh - info | [2016-04-28 17:35:59,087: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[776d23aa-e3c6-452d-95bd-7ae35409e9a5] + info | [2016-04-28 17:35:59,087: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[776d23aa-e3c6-452d-95bd-7ae35409e9a5] debug | [2016-04-28 17:36:00,036: DEBUG/Worker-27] svn co svn://svn.debian.org/svn/python-modules/@1 info | [2016-04-28 17:36:00,509: INFO/Worker-27] [revision_start-revision_end]: [1-34523] info | [2016-04-28 17:36:00,522: INFO/Worker-27] Repo {'remote_url': 'svn://svn.debian.org/svn/python-modules', 'local_url': '/tmp/tmp.7t45udhc.swh.loader/python-modules', 'uuid': '771dd761-d7fa-0310-a302-f036d1c1ebb6', 'swh-origin': 122} ready to be processed. - info | [2016-05-02 01:42:49,471: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[776d23aa-e3c6-452d-95bd-7ae35409e9a5] succeeded in 288410.36918153404s: None + info | [2016-05-02 01:42:49,471: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[776d23aa-e3c6-452d-95bd-7ae35409e9a5] succeeded in 288410.36918153404s: None #+END_SRC *** DONE pkg-gnome CLOSED: [2016-05-12 Thu 14:54] #+BEGIN_SRC log - info | [2016-04-16 20:02:34,346: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[b05b9228-7842-4cf1-9f8e-79edb462c262] + info | [2016-04-16 20:02:34,346: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[b05b9228-7842-4cf1-9f8e-79edb462c262] debug | [2016-04-16 20:02:35,262: DEBUG/Worker-7] svn co svn://svn.debian.org/svn/pkg-gnome/@1 info | [2016-04-16 20:02:35,625: INFO/Worker-7] [revision_start-revision_end]: [1-48013] info | [2016-04-16 20:02:35,629: INFO/Worker-48806] Archive gs://google-code-archive-source/v2/code.google.com/dennisbabiak-gccgcm/source-archive.zip fetched. info | [2016-04-16 20:02:35,641: INFO/Worker-7] Repo {'remote_url': 'svn://svn.debian.org/svn/pkg-gnome', 'local_url': '/tmp/tmp.uxt6n47f.swh.loader/pkg-gnome', 'uuid': 'db0db5de-e4c8-0310-9441-90abf70311f7', 'swh-origin': 27} ready to be processed. - info | [2016-04-23 17:16:17,149: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[b05b9228-7842-4cf1-9f8e-79edb462c262] succeeded in 594822.724728007s: None + info | [2016-04-23 17:16:17,149: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[b05b9228-7842-4cf1-9f8e-79edb462c262] succeeded in 594822.724728007s: None #+END_SRC ** git svn clone/fetch @@ -702,7 +702,7 @@ #+BEGIN_SRC txt Jun 11 20:13:30 worker01 python3[27823]: [tasks] Jun 11 20:13:30 worker01 python3[27823]: . swh.loader.core.tasks.LoaderCoreTask -Jun 11 20:13:30 worker01 python3[27823]: . swh.loader.svn.tasks.LoadSvnRepositoryTsk +Jun 11 20:13:30 worker01 python3[27823]: . 
swh.loader.svn.tasks.LoadSvnRepository Jun 11 20:13:30 worker01 python3[27823]: [2016-06-11 20:13:30,213: INFO/MainProcess] Connected to amqp://swhconsumer:**@moma:5672// Jun 11 20:13:30 worker01 python3[27823]: [2016-06-11 20:13:30,259: INFO/MainProcess] mingle: searching for neighbors Jun 11 20:13:31 worker01 python3[27823]: [2016-06-11 20:13:31,369: INFO/MainProcess] mingle: sync with 7 nodes @@ -711,13 +711,13 @@ Jun 11 20:15:04 worker01 python3[27823]: [2016-06-11 20:15:04,997: INFO/MainProcess] Started consuming from swh_loader_svn Jun 11 20:15:06 worker01 python3[27823]: [2016-06-11 20:15:06,204: INFO/MainProcess] Cancelling queue swh_loader_svn Jun 11 20:15:20 worker01 python3[27823]: [2016-06-11 20:15:20,724: INFO/MainProcess] Started consuming from swh_loader_svn -Jun 11 20:17:51 worker01 python3[27823]: [2016-06-11 20:17:51,487: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[7fab7834-0c41-4634-89f3-1af35502461a] +Jun 11 20:17:51 worker01 python3[27823]: [2016-06-11 20:17:51,487: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[7fab7834-0c41-4634-89f3-1af35502461a] Jun 11 20:17:52 worker01 python3[27823]: [2016-06-11 20:17:52,362: INFO/Worker-10] [revision_start-revision_end]: [1-145] Jun 11 20:17:52 worker01 python3[27823]: [2016-06-11 20:17:52,386: INFO/Worker-10] Processing {'remote_url': 'svn://svn.debian.org/svn/pkg-fox', 'uuid': b'd908f651-7add-0310-a5d1-c7ac9dfebe41', 'local_url': b'/tmp/tmp.bm6rebqz.swh.loader/p kg-fox', 'swh-origin': 4}. Jun 11 20:18:55 worker01 python3[27823]: [2016-06-11 20:18:55,307: INFO/Worker-10] Processed 145 revisions: [1dda85506a12af80c5a701a02aba5a02c703642f, ...] -Jun 11 20:18:58 worker01 python3[27823]: [2016-06-11 20:18:58,078: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[7fab7834-0c41-4634-89f3-1af35502461a] succeeded in 66.56195999495685s: None -Jun 11 20:18:58 worker01 python3[27823]: [2016-06-11 20:18:58,106: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[ace251b8-7255-4e63-90b5-1a56655755e8] +Jun 11 20:18:58 worker01 python3[27823]: [2016-06-11 20:18:58,078: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[7fab7834-0c41-4634-89f3-1af35502461a] succeeded in 66.56195999495685s: None +Jun 11 20:18:58 worker01 python3[27823]: [2016-06-11 20:18:58,106: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[ace251b8-7255-4e63-90b5-1a56655755e8] Jun 11 20:18:58 worker01 python3[27823]: [2016-06-11 20:18:58,437: INFO/Worker-10] [revision_start-revision_end]: [1-6073] Jun 11 20:18:58 worker01 python3[27823]: [2016-06-11 20:18:58,453: INFO/Worker-10] Processing {'remote_url': 'svn://svn.debian.org/svn/glibc-bsd', 'uuid': b'ae44cbe4-c7d5-0310-ae45-95c72a56cd7d', 'local_url': b'/tmp/tmp.d_iw28du.swh.loader/glibc-bsd', 'swh-origin': 3}. Jun 11 20:20:07 worker01 python3[27823]: [2016-06-11 20:20:07,371: INFO/Worker-10] Processed 1000 revisions: [be6fe97464c0fedd9959073d07b2fda4cbedbe2d, ...] @@ -727,8 +727,8 @@ Jun 11 20:23:17 worker01 python3[27823]: [2016-06-11 20:23:17,551: INFO/Worker-10] Processed 1000 revisions: [b2c7a10f2127dd496048133480df041b0ab66865, ...] Jun 11 20:24:29 worker01 python3[27823]: [2016-06-11 20:24:29,274: INFO/Worker-10] Processed 1000 revisions: [382341a00301f36ceec9fca563aef85cb628b323, ...] Jun 11 20:24:35 worker01 python3[27823]: [2016-06-11 20:24:35,546: INFO/Worker-10] Processed 73 revisions: [512a9f720bd1af1581b09483846035cf292c52cd, ...] 
-Jun 11 20:24:36 worker01 python3[27823]: [2016-06-11 20:24:36,205: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[ace251b8-7255-4e63-90b5-1a56655755e8] succeeded in 338.06703379005194s: None -Jun 11 20:24:36 worker01 python3[27823]: [2016-06-11 20:24:36,240: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[275183f6-ef9e-4533-aa72-322f080b76e1] +Jun 11 20:24:36 worker01 python3[27823]: [2016-06-11 20:24:36,205: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[ace251b8-7255-4e63-90b5-1a56655755e8] succeeded in 338.06703379005194s: None +Jun 11 20:24:36 worker01 python3[27823]: [2016-06-11 20:24:36,240: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[275183f6-ef9e-4533-aa72-322f080b76e1] Jun 11 20:24:36 worker01 python3[27823]: [2016-06-11 20:24:36,584: INFO/Worker-10] [revision_start-revision_end]: [1-10707] Jun 11 20:24:36 worker01 python3[27823]: [2016-06-11 20:24:36,600: INFO/Worker-10] Processing {'remote_url': 'svn://svn.debian.org/svn/pkg-voip', 'uuid': b'5e74be4b-f5d6-0310-a852-e9e23c5afa6a', 'local_url': b'/tmp/tmp.0y0ny007.swh.loader/pkg-voip', 'swh-origin': 32}. Jun 11 20:25:34 worker01 python3[27823]: [2016-06-11 20:25:34,718: INFO/Worker-10] Processed 1000 revisions: [a1fff6a0e5e397f634d0ea5c1600bc723d019e4c, ...] @@ -742,8 +742,8 @@ Jun 11 20:30:18 worker01 python3[27823]: [2016-06-11 20:30:18,890: INFO/Worker-10] Processed 1000 revisions: [943042e9d1113406483a2a5b6f39023935c0532a, ...] Jun 11 20:32:20 worker01 python3[27823]: [2016-06-11 20:32:20,890: INFO/Worker-10] Processed 1000 revisions: [81405fca1c7f928fcc6a2b137546b4f94f22551e, ...] Jun 11 20:33:29 worker01 python3[27823]: [2016-06-11 20:33:29,927: INFO/Worker-10] Processed 707 revisions: [5419a4e4da9d9e37df35c1c9455024fe8170d2fa, ...] -Jun 11 20:33:32 worker01 python3[27823]: [2016-06-11 20:33:32,487: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[275183f6-ef9e-4533-aa72-322f080b76e1] succeeded in 536.1971881072968s: None -Jun 11 20:33:32 worker01 python3[27823]: [2016-06-11 20:33:32,522: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[8677e0f5-4938-4146-b164-720ec7294cb4] +Jun 11 20:33:32 worker01 python3[27823]: [2016-06-11 20:33:32,487: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[275183f6-ef9e-4533-aa72-322f080b76e1] succeeded in 536.1971881072968s: None +Jun 11 20:33:32 worker01 python3[27823]: [2016-06-11 20:33:32,522: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[8677e0f5-4938-4146-b164-720ec7294cb4] Jun 11 20:33:32 worker01 python3[27823]: [2016-06-11 20:33:32,816: INFO/Worker-10] [revision_start-revision_end]: [1-34523] Jun 11 20:33:32 worker01 python3[27823]: [2016-06-11 20:33:32,874: INFO/Worker-10] Processing {'remote_url': 'svn://svn.debian.org/svn/python-modules', 'uuid': b'771dd761-d7fa-0310-a302-f036d1c1ebb6', 'local_url': b'/tmp/tmp.jokuejpx.swh.loader/python-modules', 'swh-origin': 122}. Jun 11 20:34:43 worker01 python3[27823]: [2016-06-11 20:34:43,590: INFO/Worker-10] Processed 1000 revisions: [4ca7de178b8d929a6dfc12e113b3072730eeb4c3, ...] @@ -781,8 +781,8 @@ Jun 11 21:24:43 worker01 python3[27823]: [2016-06-11 21:24:43,586: INFO/Worker-10] Processed 1000 revisions: [8de5385844c1a8de253ad8945dc06259d11c7fc8, ...] Jun 11 21:27:21 worker01 python3[27823]: [2016-06-11 21:27:21,950: INFO/Worker-10] Processed 1000 revisions: [c0b63db3767754092c8c1fe92b07e66e77b7fed3, ...] 
Jun 11 21:28:34 worker01 python3[27823]: [2016-06-11 21:28:34,131: INFO/Worker-10] Processed 523 revisions: [ba52b091af3078562f7e7fc05c04b9469988e006, ...] -Jun 11 21:28:43 worker01 python3[27823]: [2016-06-11 21:28:43,356: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[8677e0f5-4938-4146-b164-720ec7294cb4] succeeded in 3310.805134777911s: None -Jun 11 21:28:43 worker01 python3[27823]: [2016-06-11 21:28:43,373: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[b3f34747-3575-4160-8543-6ddf23b8822e] +Jun 11 21:28:43 worker01 python3[27823]: [2016-06-11 21:28:43,356: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[8677e0f5-4938-4146-b164-720ec7294cb4] succeeded in 3310.805134777911s: None +Jun 11 21:28:43 worker01 python3[27823]: [2016-06-11 21:28:43,373: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[b3f34747-3575-4160-8543-6ddf23b8822e] Jun 11 21:28:43 worker01 python3[27823]: [2016-06-11 21:28:43,674: INFO/Worker-10] [revision_start-revision_end]: [1-49061] Jun 11 21:28:43 worker01 python3[27823]: [2016-06-11 21:28:43,688: INFO/Worker-10] Processing {'remote_url': 'svn://svn.debian.org/svn/pkg-gnome', 'uuid': b'db0db5de-e4c8-0310-9441-90abf70311f7', 'local_url': b'/tmp/tmp.kgkdf865.swh.loader/pkg-gnome', 'swh-origin': 27}. Jun 11 21:29:23 worker01 python3[27823]: [2016-06-11 21:29:23,231: INFO/Worker-10] Processed 1000 revisions: [29a3157f1d4a82955860a6fd3397bbd94573b555, ...] @@ -842,7 +842,7 @@ Jun 11 22:25:16 worker01 python3[27823]: [2016-06-11 22:25:16,759: INFO/Worker-10] Processed 1000 revisions: [f61326f5d12094860dead900d2d46d0c368f4e4b, ...] Jun 11 22:27:01 worker01 python3[27823]: [2016-06-11 22:27:01,152: INFO/Worker-10] Processed 1000 revisions: [754bfe4141e5a9165a7458945157791ac85e6ff9, ...] Jun 11 22:27:06 worker01 python3[27823]: [2016-06-11 22:27:06,833: INFO/Worker-10] Processed 61 revisions: [27e079019bc07ef84716c80c62ec53a39d806879, ...] -Jun 11 22:27:23 worker01 python3[27823]: [2016-06-11 22:27:23,340: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[b3f34747-3575-4160-8543-6ddf23b8822e] succeeded in 3519.9480575090274s: None +Jun 11 22:27:23 worker01 python3[27823]: [2016-06-11 22:27:23,340: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[b3f34747-3575-4160-8543-6ddf23b8822e] succeeded in 3519.9480575090274s: None #+END_SRC @@ -862,12 +862,12 @@ ** Log extract #+BEGIN_SRC sh -Jun 11 22:31:34 worker01 python3[13656]: [2016-06-11 22:31:34,315: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[9b6b05d0-f7fe-4799-86a0-f9ec7bd67ead] +Jun 11 22:31:34 worker01 python3[13656]: [2016-06-11 22:31:34,315: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[9b6b05d0-f7fe-4799-86a0-f9ec7bd67ead] Jun 11 22:31:34 worker01 python3[13656]: [2016-06-11 22:31:34,657: INFO/Worker-10] [revision_start-revision_end]: [1-145] Jun 11 22:31:34 worker01 python3[13656]: [2016-06-11 22:31:34,674: INFO/Worker-10] Processing {'remote_url': 'svn://svn.debian.org/svn/pkg-fox', 'local_url': b'/tmp/tmp.lod3o5u5.swh.loader/pkg-fox', 'uuid': b'd908f651-7add-0310-a5d1-c7ac9dfebe41', 'swh-origin': 4}. Jun 11 22:33:27 worker01 python3[13656]: [2016-06-11 22:33:27,415: INFO/Worker-10] Processed 145 revisions: [1dda85506a12af80c5a701a02aba5a02c703642f, ...] 
-Jun 11 22:33:31 worker01 python3[13656]: [2016-06-11 22:33:31,508: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[9b6b05d0-f7fe-4799-86a0-f9ec7bd67ead] succeeded in 117.17479287087917s: None -Jun 11 22:33:31 worker01 python3[13656]: [2016-06-11 22:33:31,552: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[f0375fb9-cf8c-40b5-9e04-db9e37fd9ecb] +Jun 11 22:33:31 worker01 python3[13656]: [2016-06-11 22:33:31,508: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[9b6b05d0-f7fe-4799-86a0-f9ec7bd67ead] succeeded in 117.17479287087917s: None +Jun 11 22:33:31 worker01 python3[13656]: [2016-06-11 22:33:31,552: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[f0375fb9-cf8c-40b5-9e04-db9e37fd9ecb] Jun 11 22:33:31 worker01 python3[13656]: [2016-06-11 22:33:31,856: INFO/Worker-10] [revision_start-revision_end]: [1-6073] Jun 11 22:33:31 worker01 python3[13656]: [2016-06-11 22:33:31,875: INFO/Worker-10] Processing {'remote_url': 'svn://svn.debian.org/svn/glibc-bsd', 'local_url': b'/tmp/tmp.w1bf5rdz.swh.loader/glibc-bsd', 'uuid': b'ae44cbe4-c7d5-0310-ae45-95c72a56cd7d', 'swh-origin': 3}. Jun 11 22:36:19 worker01 python3[13656]: [2016-06-11 22:36:19,485: INFO/Worker-10] Processed 1000 revisions: [be6fe97464c0fedd9959073d07b2fda4cbedbe2d, ...] @@ -877,8 +877,8 @@ Jun 11 22:46:03 worker01 python3[13656]: [2016-06-11 22:46:03,775: INFO/Worker-10] Processed 1000 revisions: [b2c7a10f2127dd496048133480df041b0ab66865, ...] Jun 11 22:48:57 worker01 python3[13656]: [2016-06-11 22:48:57,500: INFO/Worker-10] Processed 1000 revisions: [382341a00301f36ceec9fca563aef85cb628b323, ...] Jun 11 22:49:15 worker01 python3[13656]: [2016-06-11 22:49:15,687: INFO/Worker-10] Processed 73 revisions: [512a9f720bd1af1581b09483846035cf292c52cd, ...] -Jun 11 22:49:34 worker01 python3[13656]: [2016-06-11 22:49:34,067: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[f0375fb9-cf8c-40b5-9e04-db9e37fd9ecb] succeeded in 962.4779040301219s: None -Jun 11 22:49:34 worker01 python3[13656]: [2016-06-11 22:49:34,087: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[60670885-84de-4884-937d-b7aecdbedcd5] +Jun 11 22:49:34 worker01 python3[13656]: [2016-06-11 22:49:34,067: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[f0375fb9-cf8c-40b5-9e04-db9e37fd9ecb] succeeded in 962.4779040301219s: None +Jun 11 22:49:34 worker01 python3[13656]: [2016-06-11 22:49:34,087: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[60670885-84de-4884-937d-b7aecdbedcd5] Jun 11 22:49:34 worker01 python3[13656]: [2016-06-11 22:49:34,371: INFO/Worker-10] [revision_start-revision_end]: [1-10707] Jun 11 22:49:34 worker01 python3[13656]: [2016-06-11 22:49:34,391: INFO/Worker-10] Processing {'remote_url': 'svn://svn.debian.org/svn/pkg-voip', 'local_url': b'/tmp/tmp.w9fxgpt4.swh.loader/pkg-voip', 'uuid': b'5e74be4b-f5d6-0310-a852-e9e2 3c5afa6a', 'swh-origin': 32}. @@ -893,8 +893,8 @@ Jun 11 23:28:30 worker01 python3[13656]: [2016-06-11 23:28:30,510: INFO/Worker-10] Processed 1000 revisions: [943042e9d1113406483a2a5b6f39023935c0532a, ...] Jun 11 23:35:10 worker01 python3[13656]: [2016-06-11 23:35:10,918: INFO/Worker-10] Processed 1000 revisions: [81405fca1c7f928fcc6a2b137546b4f94f22551e, ...] Jun 11 23:39:53 worker01 python3[13656]: [2016-06-11 23:39:53,511: INFO/Worker-10] Processed 707 revisions: [5419a4e4da9d9e37df35c1c9455024fe8170d2fa, ...] 
-Jun 11 23:40:02 worker01 python3[13656]: [2016-06-11 23:40:02,978: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[60670885-84de-4884-937d-b7aecdbedcd5] succeeded in 3028.8734963517636s: None -Jun 11 23:40:02 worker01 python3[13656]: [2016-06-11 23:40:02,997: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[740dd7e1-3d96-4bc8-8fce-faef773acbb6] +Jun 11 23:40:02 worker01 python3[13656]: [2016-06-11 23:40:02,978: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[60670885-84de-4884-937d-b7aecdbedcd5] succeeded in 3028.8734963517636s: None +Jun 11 23:40:02 worker01 python3[13656]: [2016-06-11 23:40:02,997: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[740dd7e1-3d96-4bc8-8fce-faef773acbb6] Jun 11 23:40:03 worker01 python3[13656]: [2016-06-11 23:40:03,305: INFO/Worker-10] [revision_start-revision_end]: [1-34523] Jun 11 23:40:03 worker01 python3[13656]: [2016-06-11 23:40:03,324: INFO/Worker-10] Processing {'remote_url': 'svn://svn.debian.org/svn/python-modules', 'local_url': b'/tmp/tmp.g_iyx8bd.swh.loader/python-modules', 'uuid': b'771dd761-d7fa-03 10-a302-f036d1c1ebb6', 'swh-origin': 122}. @@ -933,8 +933,8 @@ Jun 12 05:26:47 worker01 python3[13656]: [2016-06-12 05:26:47,593: INFO/Worker-10] Processed 1000 revisions: [8de5385844c1a8de253ad8945dc06259d11c7fc8, ...] Jun 12 05:46:27 worker01 python3[13656]: [2016-06-12 05:46:27,938: INFO/Worker-10] Processed 1000 revisions: [c0b63db3767754092c8c1fe92b07e66e77b7fed3, ...] Jun 12 05:57:00 worker01 python3[13656]: [2016-06-12 05:57:00,711: INFO/Worker-10] Processed 523 revisions: [ba52b091af3078562f7e7fc05c04b9469988e006, ...] -Jun 12 05:57:34 worker01 python3[13656]: [2016-06-12 05:57:34,284: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[740dd7e1-3d96-4bc8-8fce-faef773acbb6] succeeded in 22651.265359937213s: None -Jun 12 05:57:34 worker01 python3[13656]: [2016-06-12 05:57:34,299: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepositoryTsk[10b1e441-4ab0-4121-a27f-7d32f1312b1f] +Jun 12 05:57:34 worker01 python3[13656]: [2016-06-12 05:57:34,284: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[740dd7e1-3d96-4bc8-8fce-faef773acbb6] succeeded in 22651.265359937213s: None +Jun 12 05:57:34 worker01 python3[13656]: [2016-06-12 05:57:34,299: INFO/MainProcess] Received task: swh.loader.svn.tasks.LoadSvnRepository[10b1e441-4ab0-4121-a27f-7d32f1312b1f] Jun 12 05:57:34 worker01 python3[13656]: [2016-06-12 05:57:34,593: INFO/Worker-10] [revision_start-revision_end]: [1-49061] Jun 12 05:57:34 worker01 python3[13656]: [2016-06-12 05:57:34,623: INFO/Worker-10] Processing {'remote_url': 'svn://svn.debian.org/svn/pkg-gnome', 'local_url': b'/tmp/tmp.h36l0khh.swh.loader/pkg-gnome', 'uuid': b'db0db5de-e4c8-0310-9441-90 abf70311f7', 'swh-origin': 27}. @@ -988,5 +988,5 @@ Jun 12 17:28:04 worker01 python3[13656]: [2016-06-12 17:28:04,678: INFO/Worker-10] Processed 1000 revisions: [f61326f5d12094860dead900d2d46d0c368f4e4b, ...] Jun 12 18:01:24 worker01 python3[13656]: [2016-06-12 18:01:24,516: INFO/Worker-10] Processed 1000 revisions: [754bfe4141e5a9165a7458945157791ac85e6ff9, ...] Jun 12 18:04:03 worker01 python3[13656]: [2016-06-12 18:04:03,334: INFO/Worker-10] Processed 61 revisions: [27e079019bc07ef84716c80c62ec53a39d806879, ...] 
-Jun 12 18:04:43 worker01 python3[13656]: [2016-06-12 18:04:43,418: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[10b1e441-4ab0-4121-a27f-7d32f1312b1f] succeeded in 43629.105175915174s: None +Jun 12 18:04:43 worker01 python3[13656]: [2016-06-12 18:04:43,418: INFO/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[10b1e441-4ab0-4121-a27f-7d32f1312b1f] succeeded in 43629.105175915174s: None #+END_SRC diff --git a/errors.org b/errors.org --- a/errors.org +++ b/errors.org @@ -4,7 +4,7 @@ * Malformed xml exception when asking for log #+BEGIN_SRC sh -[2016-04-03 11:10:43,740: ERROR/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[8ffa1cf0-6e88-40f5-9918-f659de0810f3] raised unexpected: ClientError('The XML response contains invalid XML\nMalformed XML: not well-formed (invalid token)',) +[2016-04-03 11:10:43,740: ERROR/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[8ffa1cf0-6e88-40f5-9918-f659de0810f3] raised unexpected: ClientError('The XML response contains invalid XML\nMalformed XML: not well-formed (invalid token)',) Traceback (most recent call last): File "/usr/lib/python3/dist-packages/celery/app/trace.py", line 240, in trace_task R = retval = fun(*args, **kwargs) @@ -274,7 +274,7 @@ ??? #+BEGIN_SRC sh -[2016-04-03 12:45:41,468: ERROR/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepositoryTsk[9bf99330-4960-41ab-989f-3b172ffa92e6] raised unexpected: ValueError(ClientError("The node '/tmp/tmp.kra26_aa.swh.loader/dot-files/LICENSE.txt' was not found.",),) +[2016-04-03 12:45:41,468: ERROR/MainProcess] Task swh.loader.svn.tasks.LoadSvnRepository[9bf99330-4960-41ab-989f-3b172ffa92e6] raised unexpected: ValueError(ClientError("The node '/tmp/tmp.kra26_aa.swh.loader/dot-files/LICENSE.txt' was not found.",),) Traceback (most recent call last): File "/usr/lib/python3/dist-packages/celery/app/trace.py", line 240, in trace_task R = retval = fun(*args, **kwargs) diff --git a/swh/loader/svn/loader.py b/swh/loader/svn/loader.py --- a/swh/loader/svn/loader.py +++ b/swh/loader/svn/loader.py @@ -12,7 +12,6 @@ import shutil import tempfile -from swh.core import utils from swh.model import hashutil from swh.model.from_disk import Directory from swh.model.identifiers import identifier_to_bytes, revision_identifier @@ -22,7 +21,7 @@ from . import svn, converters from .utils import init_svn_repo_from_archive_dump -from .exception import SvnLoaderEventful, SvnLoaderUneventful +from .exception import SvnLoaderUneventful from .exception import SvnLoaderHistoryAltered @@ -51,7 +50,7 @@ TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.svn.' -class SWHSvnLoader(SWHLoader): +class SvnLoader(SWHLoader): """Swh svn loader to load an svn repository The repository is either remote or local. The loader deals with update on an already previously loaded repository. @@ -78,6 +77,15 @@ self.debug = self.config['debug'] self.last_seen_revision = None self.temp_directory = self.config['temp_directory'] + self.done = False + # internal state used to store swh objects + self._contents = [] + self._directories = [] + self._revisions = [] + self._snapshot = None + self._last_revision = None + self._visit_status = 'full' + self._load_status = 'uneventful' def pre_cleanup(self): """Cleanup potential dangling files from prior runs (e.g. OOM killed @@ -92,6 +100,10 @@ """Clean up the svn repository's working representation on disk. 
""" + if not hasattr(self, 'svnrepo'): + # could happen if `prepare` fails + # nothing to do in that case + return if self.debug: self.log.error('''NOT FOR PRODUCTION - debug flag activated Local repository not cleaned up for investigation: %s''' % ( @@ -114,21 +126,21 @@ self.svnrepo.clean_fs(local_dirname) return h - def get_svn_repo(self, svn_url, local_dirname, origin): + def get_svn_repo(self, svn_url, local_dirname, origin_id): """Instantiates the needed svnrepo collaborator to permit reading svn repository. Args: svn_url (str): the svn repository url to read from local_dirname (str): the local path on disk to compute data - origin (int): the corresponding origin + origin_id (int): the corresponding origin id Returns: Instance of :mod:`swh.loader.svn.svn` clients + """ - return svn.SWHSvnRepo( - svn_url, origin['id'], self.storage, - local_dirname=local_dirname) + return svn.SvnRepo(svn_url, + local_dirname=local_dirname, origin_id=origin_id) def swh_latest_snapshot_revision(self, origin_id, previous_swh_revision=None): @@ -165,7 +177,12 @@ else: return {} - revs = list(storage.revision_get([previous_swh_revision])) + if isinstance(previous_swh_revision, dict): + swh_id = previous_swh_revision['id'] + else: + swh_id = previous_swh_revision + + revs = list(storage.revision_get([swh_id])) if revs: return { 'snapshot': latest_snap, @@ -192,11 +209,8 @@ The swh revision corresponding to the svn revision. """ - return converters.build_swh_revision(rev, - commit, - self.svnrepo.uuid, - dir_id, - parents) + return converters.build_swh_revision( + rev, commit, self.svnrepo.uuid, dir_id, parents) def check_history_not_altered(self, svnrepo, revision_start, swh_rev): """Given a svn repository, check if the history was not tampered with. @@ -218,28 +232,64 @@ return swh_revision_id == revision_id - def process_repository(self, origin_visit, - last_known_swh_revision=None, - start_from_scratch=False): - """The main idea of this function is to: + def _init_from(self, partial_swh_revision, previous_swh_revision): + """Function to determine from where to start from. - - iterate over the svn commit logs - - extract the svn commit log metadata - - compute the hashes from the current directory down to the file - - compute the equivalent swh revision - - send all those objects for storage - - create an swh occurrence pointing to the last swh revision seen - - send that occurrence for storage in swh-storage. + Args: + partial_swh_revision (dict): A known revision from which + the previous loading did not + finish. + known_previous_revision (dict): A known revision from + which the previous loading + did finish. + + Returns: + The revision from which to start or None if nothing (fresh + start). """ - svnrepo = self.svnrepo + if partial_swh_revision and not previous_swh_revision: + return partial_swh_revision + if not partial_swh_revision and previous_swh_revision: + return previous_swh_revision + if partial_swh_revision and previous_swh_revision: + # will determine from which to start from + extra_headers1 = dict( + partial_swh_revision['metadata']['extra_headers']) + extra_headers2 = dict( + previous_swh_revision['metadata']['extra_headers']) + rev_start1 = int(extra_headers1['svn_revision']) + rev_start2 = int(extra_headers2['svn_revision']) + if rev_start1 <= rev_start2: + return previous_swh_revision + return partial_swh_revision + + return None + + def start_from(self, last_known_swh_revision=None, + start_from_scratch=False): + """Determine from where to start the loading. 
+ + Args: + last_known_swh_revision (dict): Last know swh revision or None + start_from_scratch (bool): To start loading from scratch or not + + Returns: + tuple (revision_start, revision_end, revision_parents) + + Raises: - revision_head = svnrepo.head_revision() + SvnLoaderHistoryAltered: When a hash divergence has been + detected (should not happen) + SvnLoaderUneventful: Nothing changed since last visit + + """ + revision_head = self.svnrepo.head_revision() if revision_head == 0: # empty repository case revision_start = 0 revision_end = 0 else: # default configuration - revision_start = svnrepo.initial_revision() + revision_start = self.svnrepo.initial_revision() revision_end = revision_head revision_parents = { @@ -254,8 +304,8 @@ swh_rev = None # Determine from which known revision to start - swh_rev = self.init_from(last_known_swh_revision, - previous_swh_revision=swh_rev) + swh_rev = self._init_from(last_known_swh_revision, + previous_swh_revision=swh_rev) if swh_rev: # Yes, we know a previous revision. Try and update it. extra_headers = dict(swh_rev['metadata']['extra_headers']) @@ -265,16 +315,16 @@ } self.log.debug('svn export --ignore-keywords %s@%s' % ( - svnrepo.remote_url, + self.svnrepo.remote_url, revision_start)) if swh_rev and not self.check_history_not_altered( - svnrepo, + self.svnrepo, revision_start, swh_rev): msg = 'History of svn %s@%s altered. ' \ 'Skipping...' % ( - svnrepo.remote_url, revision_start) + self.svnrepo.remote_url, revision_start) raise SvnLoaderHistoryAltered(msg) # now we know history is ok, we start at next revision @@ -284,33 +334,33 @@ revision_parents[revision_start] = [swh_rev['id']] if revision_start > revision_end and revision_start is not 1: - msg = '%s@%s already injected.' % (svnrepo.remote_url, + msg = '%s@%s already injected.' % (self.svnrepo.remote_url, revision_end) raise SvnLoaderUneventful(msg) self.log.info('Processing revisions [%s-%s] for %s' % ( - revision_start, revision_end, svnrepo)) + revision_start, revision_end, self.svnrepo)) - # process and store revision to swh (sent by by blocks of - # 'revision_packet_size') - return self.process_swh_revisions( - svnrepo, revision_start, revision_end, revision_parents) + return revision_start, revision_end, revision_parents def process_svn_revisions(self, svnrepo, revision_start, revision_end, revision_parents): - """Process revisions from revision_start to revision_end and send to - swh for storage. + """Process svn revisions from revision_start to revision_end. - At each svn revision, checkout the repository, compute the - tree hash and blobs and send for swh storage to store. - Then computes and yields the swh revision. + At each svn revision, apply new diffs and simultaneously + compute swh hashes. This yields those computed swh hashes as + a tuple (contents, directories, revision). - Note that at every self.check_revision, an svn export is done - and a hash tree is computed to check that no divergence - occurred. + Note that at every `self.check_revision`, a supplementary + check takes place to check for hash-tree divergence (related + T570). Yields: - swh revision as a dictionary with keys, sha1_git, sha1, etc... + tuple (contents, directories, revision) of dict as a + dictionary with keys, sha1_git, sha1, etc... 
+ + Raises: + ValueError in case of a hash divergence detection """ gen_revs = svnrepo.swh_hash_data_per_revision( @@ -321,9 +371,8 @@ for rev, nextrev, commit, new_objects, root_directory in gen_revs: count += 1 # Send the associated contents/directories - self.maybe_load_contents(new_objects.get('content', {}).values()) - self.maybe_load_directories( - new_objects.get('directory', {}).values()) + _contents = new_objects.get('content', {}).values() + _directories = new_objects.get('directory', {}).values() # compute the fs tree's checksums dir_id = root_directory.hash @@ -337,6 +386,7 @@ hashutil.hash_to_hex(swh_revision['id']), hashutil.hash_to_hex(dir_id))) + # FIXME: Is that still necessary? Rationale: T570 is now closed if (count % self.check_revision) == 0: # hash computation check self.log.debug('Checking hash computations on revision %s...' % rev) @@ -352,70 +402,7 @@ if nextrev: revision_parents[nextrev] = [swh_revision['id']] - yield swh_revision - - def process_swh_revisions(self, - svnrepo, - revision_start, - revision_end, - revision_parents): - """Process and store revision to swh (sent by blocks of - revision_packet_size) - - Returns: - The latest revision stored. - - """ - try: - swh_revision_gen = self.process_svn_revisions(svnrepo, - revision_start, - revision_end, - revision_parents) - revs = [] - for revisions in utils.grouper( - swh_revision_gen, - self.config['revision_packet_size']): - revs = list(revisions) - self.maybe_load_revisions(revs) - last_revision = revs[-1] - self.log.debug('Processed %s revisions: [..., %s]' % ( - len(revs), hashutil.hash_to_hex(last_revision['id']))) - self.last_seen_revision = last_revision - except Exception as e: - if revs: - # flush remaining revisions - self.maybe_load_revisions(revs) - # Take the last one as the last known revisions - known_swh_rev = revs[-1] - elif self.last_seen_revision: # We'll try to make a snapshot - known_swh_rev = self.last_seen_revision - else: - raise - - _id = known_swh_rev.get('id') - if not _id: - _id = _revision_id(known_swh_rev) - - # Then notify something is wrong, and we stopped at that rev. - raise SvnLoaderEventful(e, swh_revision={ - 'id': _id, - }) - - return last_revision - - def process_swh_snapshot(self, revision=None, snapshot=None): - """Create the snapshot either from existing snapshot or revision. 
- - """ - if snapshot: - snap = snapshot - elif revision: - snap = build_swh_snapshot(revision['id']) - snap['id'] = identifier_to_bytes(snapshot_identifier(snap)) - else: - return None - self.log.debug('snapshot: %s' % snap) - self.maybe_load_snapshot(snap) + yield _contents, _directories, swh_revision def prepare_origin_visit(self, *, svn_url, visit_date=None, origin_url=None, **kwargs): @@ -429,8 +416,7 @@ swh_revision=None, start_from_scratch=False, **kwargs): self.start_from_scratch = start_from_scratch if swh_revision: - self.last_known_swh_revision = hashutil.hash_to_bytes( - swh_revision) + self.last_known_swh_revision = swh_revision else: self.last_known_swh_revision = None @@ -445,81 +431,113 @@ prefix=TEMPORARY_DIR_PREFIX_PATTERN, dir=self.temp_directory) - self.svnrepo = self.get_svn_repo(svn_url, local_dirname, self.origin) + self.svnrepo = self.get_svn_repo(svn_url, local_dirname, self.origin_id) + try: + revision_start, revision_end, revision_parents = self.start_from( + self.last_known_swh_revision, self.start_from_scratch) + self.swh_revision_gen = self.process_svn_revisions( + self.svnrepo, revision_start, revision_end, revision_parents) + except SvnLoaderUneventful as e: + self.log.warn(e) + if self.latest_snapshot and 'snapshot' in self.latest_snapshot: + self._snapshot = self.latest_snapshot['snapshot'] + self.done = True + except SvnLoaderHistoryAltered as e: + self.log.error(e) + self.done = True + self._visit_status = 'partial' def fetch_data(self): - """We need to fetch and stream the data to store directly. So - fetch_data do actually nothing. The method ``store_data`` below is in - charge to do everything, fetch and store. + """Fetching svn revision information. - """ - pass + This will apply svn revision as patch on disk, and at the same + time, compute the swh hashes. - def store_data(self): - """We need to fetch and stream the data to store directly because - there is too much data and state changes. Everything is - intertwined together (We receive patch and apply on disk and - compute at the hashes at the same time) + In effect, fetch_data fetches those data and compute the + necessary swh objects. It's then stored in the internal state + instance variables (initialized in `_prepare_state`). - So every data to fetch and store is done here. + This is up to `store_data` to actually discuss with the + storage to store those objects. - Note: - origin_visit and last_known_swh_revision must have been set in the - prepare method. + Returns: + bool: True to continue fetching data (next svn revision), + False to stop. """ - origin_visit = {'origin': self.origin_id, 'visit': self.visit} + data = None + if self.done: + return False + try: - latest_rev = self.process_repository( - origin_visit, - last_known_swh_revision=self.last_known_swh_revision, - start_from_scratch=self.start_from_scratch) - except SvnLoaderEventful as e: - latest_rev = e.swh_revision - self.process_swh_snapshot(revision=latest_rev) - raise - except Exception as e: - if self.latest_snapshot and 'snapshot' in self.latest_snapshot: - snapshot = self.latest_snapshot['snapshot'] - self.process_swh_snapshot(snapshot=snapshot) - raise - else: - self.process_swh_snapshot(revision=latest_rev) + data = next(self.swh_revision_gen) + self._load_status = 'eventful' + except StopIteration: + self.done = True + self._visit_status = 'full' + return False # Stopping iteration + except Exception as e: # Potential: svn:external, i/o error... 
+ self.done = True + self._visit_status = 'partial' + return False # Stopping iteration + self._contents, self._directories, revision = data + if revision: + self._last_revision = revision + self._revisions.append(revision) + return True # next svn revision - def init_from(self, partial_swh_revision, previous_swh_revision): - """Function to determine from where to start from. + def store_data(self): + """We store the data accumulated in internal instance variable. If + the iteration over the svn revisions is done, we create the + snapshot and flush to storage the data. - Args: - partial_swh_revision: A known revision from which - the previous loading did not finish. - known_previous_revision: A known revision from which the - previous loading did finish. + This also resets the internal instance variable state. - Returns: - The revision from which to start or None if nothing (fresh - start). + """ + self.maybe_load_contents(self._contents) + self.maybe_load_directories(self._directories) + self.maybe_load_revisions(self._revisions) + + if self.done: # finish line, snapshot! + self.generate_and_load_snapshot(revision=self._last_revision, + snapshot=self._snapshot) + self.flush() + + self._contents = [] + self._directories = [] + self._revisions = [] + + def generate_and_load_snapshot(self, revision=None, snapshot=None): + """Create the snapshot either from existing revision or snapshot. + + Revision (supposedly new) has priority over the snapshot + (supposedly existing one). + + Args: + revision (dict): Last revision seen if any (None by default) + snapshot (dict): Snapshot to use if any (None by default) """ - if partial_swh_revision and not previous_swh_revision: - return partial_swh_revision - if not partial_swh_revision and previous_swh_revision: - return previous_swh_revision - if partial_swh_revision and previous_swh_revision: - # will determine from which to start from - extra_headers1 = dict( - partial_swh_revision['metadata']['extra_headers']) - extra_headers2 = dict( - previous_swh_revision['metadata']['extra_headers']) - rev_start1 = int(extra_headers1['svn_revision']) - rev_start2 = int(extra_headers2['svn_revision']) - if rev_start1 <= rev_start2: - return previous_swh_revision - return partial_swh_revision + if revision: # Priority to the revision + snap = build_swh_snapshot(revision['id']) + snap['id'] = identifier_to_bytes(snapshot_identifier(snap)) + elif snapshot: # Fallback to prior snapshot + snap = snapshot + else: + return None + self.log.debug('snapshot: %s' % snap) + self.maybe_load_snapshot(snap) - return None + def load_status(self): + return { + 'status': self._load_status, + } + + def visit_status(self): + return self._visit_status -class SWHSvnLoaderFromDumpArchive(SWHSvnLoader): +class SvnLoaderFromDumpArchive(SvnLoader): """Uncompress an archive containing an svn dump, mount the svn dump as an svn repository and load said repository. 
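The `SvnLoader` refactoring above replaces the old one-shot `process_repository`/`process_swh_revisions` flow with an incremental `fetch_data`/`store_data` contract. Below is a minimal sketch of the loop a caller is assumed to run over a prepared loader instance; normally the `swh.loader.core` base class does this inside `load()`, and the `drive` helper name here is purely illustrative, not the actual base-class code:

```
def drive(loader):
    # One iteration per svn revision: fetch_data() replays the next revision
    # on disk, hashes it, and stashes the resulting contents, directories and
    # revision in the loader's internal state; it returns True while there is
    # more to process, False when done (or on an uneventful/partial visit).
    keep_fetching = True
    while keep_fetching:
        keep_fetching = loader.fetch_data()
        # store_data() flushes the accumulated objects to storage and, once
        # the loader is done, builds and loads the snapshot.
        loader.store_data()
    return loader.load_status(), loader.visit_status()
```

In practice callers keep using `loader.load(svn_url=..., destination_path=...)` as in the README and the tests; the loop above only illustrates the contract described in the `fetch_data` and `store_data` docstrings.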
diff --git a/swh/loader/svn/producer.py b/swh/loader/svn/producer.py --- a/swh/loader/svn/producer.py +++ b/swh/loader/svn/producer.py @@ -109,7 +109,7 @@ """Produce svn urls to celery queue """ - task = get_task('swh.loader.svn.tasks.LoadSWHSvnRepositoryTsk') + task = get_task('swh.loader.svn.tasks.LoadSvnRepository') if synchroneous: fn = task @@ -148,7 +148,7 @@ """Produce svn dumps to celery queue """ - task = get_task('swh.loader.svn.tasks.MountAndLoadSvnRepositoryTsk') + task = get_task('swh.loader.svn.tasks.MountAndLoadSvnRepository') if synchroneous: fn = task diff --git a/swh/loader/svn/ra.py b/swh/loader/svn/ra.py --- a/swh/loader/svn/ra.py +++ b/swh/loader/svn/ra.py @@ -111,7 +111,7 @@ EOL_STYLE = {} -class SWHFileEditor: +class FileEditor: """File Editor in charge of updating file on disk and memory objects. """ @@ -235,10 +235,10 @@ data=True) -class BaseDirSWHEditor: +class BaseDirEditor: """Base class implementation of dir editor. - see :class:`SWHDirEditor` for an implementation that hashes every + see :class:`DirEditor` for an implementation that hashes every directory encountered. Instantiate a new class inheriting from this class and define the following @@ -303,7 +303,7 @@ """ path = os.fsencode(args[0]) self.directory[path] = Content() - return SWHFileEditor(self.directory, rootpath=self.rootpath, path=path) + return FileEditor(self.directory, rootpath=self.rootpath, path=path) def add_file(self, path, copyfrom_path=None, copyfrom_rev=-1): """Creating a new file. @@ -311,7 +311,7 @@ """ path = os.fsencode(path) self.directory[path] = Content() - return SWHFileEditor(self.directory, self.rootpath, path) + return FileEditor(self.directory, self.rootpath, path) def change_prop(self, key, value): """Change property callback on directory. @@ -334,7 +334,7 @@ self.update_checksum() -class SWHDirEditor(BaseDirSWHEditor): +class DirEditor(BaseDirEditor): """Directory Editor in charge of updating directory hashes computation. This implementation includes empty folder in the hash computation. @@ -366,12 +366,12 @@ return self -class SWHEditor: - """SWH Editor in charge of replaying svn events and computing objects - along. +class Editor: + """Editor in charge of replaying svn events and computing objects + along. - This implementation accounts for empty folder during hash - computations. + This implementation accounts for empty folder during hash + computations. """ def __init__(self, rootpath, directory): @@ -388,10 +388,10 @@ pass def open_root(self, base_revnum): - return SWHDirEditor(self.directory, rootpath=self.rootpath) + return DirEditor(self.directory, rootpath=self.rootpath) -class SWHReplay: +class Replay: """Replay class. """ def __init__(self, conn, rootpath, directory=None): @@ -400,7 +400,7 @@ if directory is None: directory = Directory() self.directory = directory - self.editor = SWHEditor(rootpath=rootpath, directory=directory) + self.editor = Editor(rootpath=rootpath, directory=directory) def replay(self, rev): """Replay svn actions between rev and rev+1. @@ -445,7 +445,7 @@ @click.option('--cleanup/--nocleanup', default=True, help="Indicates whether to cleanup disk when done or not.") def main(local_url, svn_url, revision_start, revision_end, debug, cleanup): - """Script to present how to use SWHReplay class. + """Script to present how to use Replay class. 
""" conn = RemoteAccess(svn_url.encode('utf-8'), @@ -466,7 +466,7 @@ revision_end = min(revision_end, revision_end_max) try: - replay = SWHReplay(conn, rootpath) + replay = Replay(conn, rootpath) for rev in range(revision_start, revision_end+1): objects = replay.compute_hashes(rev) diff --git a/swh/loader/svn/svn.py b/swh/loader/svn/svn.py --- a/swh/loader/svn/svn.py +++ b/swh/loader/svn/svn.py @@ -9,6 +9,7 @@ """ +import logging import os import tempfile import shutil @@ -24,19 +25,17 @@ DEFAULT_AUTHOR_MESSAGE = '' -class SWHSvnRepo: - """SWH's svn repository representation. +class SvnRepo: + """Svn repository representation. Args: remote_url (str): origin_id (int): Associated origin identifier - storage (Storage): Storage to use to execute storage statements local_dirname (str): Path to write intermediary svn action results """ - def __init__(self, remote_url, origin_id, storage, local_dirname): + def __init__(self, remote_url, origin_id, local_dirname): self.remote_url = remote_url.rstrip('/') - self.storage = storage self.origin_id = origin_id auth = Auth([get_username_provider()]) @@ -53,7 +52,7 @@ 'utf-8') self.uuid = self.conn.get_uuid().encode('utf-8') - self.swhreplay = ra.SWHReplay(conn=self.conn, rootpath=self.local_url) + self.swhreplay = ra.Replay(conn=self.conn, rootpath=self.local_url) def __str__(self): return str({ @@ -242,7 +241,7 @@ save_path=True) # Update the replay collaborator with the right state - self.swhreplay = ra.SWHReplay( + self.swhreplay = ra.Replay( conn=self.conn, rootpath=self.local_url, directory=directory) @@ -261,7 +260,7 @@ used for svn repository loading. """ - if local_dirname: - shutil.rmtree(local_dirname) - else: - shutil.rmtree(self.local_dirname) + dirname = local_dirname if local_dirname else self.local_dirname + if os.path.exists(dirname): + logging.debug('cleanup %s' % dirname) + shutil.rmtree(dirname) diff --git a/swh/loader/svn/tasks.py b/swh/loader/svn/tasks.py --- a/swh/loader/svn/tasks.py +++ b/swh/loader/svn/tasks.py @@ -5,10 +5,10 @@ from swh.scheduler.task import Task -from .loader import SWHSvnLoader, SWHSvnLoaderFromDumpArchive +from .loader import SvnLoader, SvnLoaderFromDumpArchive -class LoadSWHSvnRepositoryTsk(Task): +class LoadSvnRepository(Task): """Import one svn repository to Software Heritage. """ @@ -30,12 +30,12 @@ - destination_path (str): (mandatory) root directory to locally retrieve svn's data - origin_url (str): Optional original url override - - swh_revision (dict): (optional) extra SWH revision hex to + - swh_revision (dict): (optional) extra revision hex to start from. see swh.loader.svn.SvnLoader.process docstring """ - loader = SWHSvnLoader() + loader = SvnLoader() loader.log = self.log return loader.load( svn_url=svn_url, @@ -46,7 +46,7 @@ start_from_scratch=start_from_scratch) -class MountAndLoadSvnRepositoryTsk(Task): +class MountAndLoadSvnRepository(Task): task_queue = 'swh_loader_svn_mount_and_load' def run_task(self, *, archive_path, origin_url=None, visit_date=None, @@ -56,7 +56,7 @@ 3. Clean up mounted svn repository archive. 
""" - loader = SWHSvnLoaderFromDumpArchive(archive_path) + loader = SvnLoaderFromDumpArchive(archive_path) loader.log = self.log return loader.load(svn_url='file://%s' % loader.repo_path, origin_url=origin_url, diff --git a/swh/loader/svn/tests/test_base.py b/swh/loader/svn/tests/test_base.py --- a/swh/loader/svn/tests/test_base.py +++ b/swh/loader/svn/tests/test_base.py @@ -15,7 +15,7 @@ @attr('fs') -class BaseTestSvnLoader(unittest.TestCase): +class BaseSvnLoaderTest(unittest.TestCase): """Base test loader class. In its setup, it's uncompressing a local svn mirror to /tmp. diff --git a/swh/loader/svn/tests/test_converters.py b/swh/loader/svn/tests/test_converters.py --- a/swh/loader/svn/tests/test_converters.py +++ b/swh/loader/svn/tests/test_converters.py @@ -54,7 +54,7 @@ }) -class TestAuthorSWHConverters(unittest.TestCase): +class TestAuthorConverters(unittest.TestCase): @istest def svn_author_to_swh_person(self): """The author should have name, email and fullname filled. @@ -94,7 +94,7 @@ }) -class TestSWHRevisionConverters(unittest.TestCase): +class TestRevisionConverters(unittest.TestCase): @istest def build_swh_revision_default(self): """This should build the swh revision with the swh revision's extra @@ -214,7 +214,7 @@ }) -class ConvertSWHDate(unittest.TestCase): +class ConvertDate(unittest.TestCase): @istest def svn_date_to_swh_date(self): """The timestamp should not be tampered with and include the diff --git a/swh/loader/svn/tests/test_loader.py b/swh/loader/svn/tests/test_loader.py --- a/swh/loader/svn/tests/test_loader.py +++ b/swh/loader/svn/tests/test_loader.py @@ -4,18 +4,16 @@ # See top-level LICENSE file for more information from nose.tools import istest -from test_base import BaseTestSvnLoader +from test_base import BaseSvnLoaderTest from unittest import TestCase from swh.model import hashutil from swh.loader.svn.loader import build_swh_snapshot, DEFAULT_BRANCH -from swh.loader.svn.loader import SWHSvnLoader -from swh.loader.svn.exception import SvnLoaderEventful, SvnLoaderUneventful -from swh.loader.svn.exception import SvnLoaderHistoryAltered +from swh.loader.svn.loader import SvnLoader -class TestSWHSnapshot(TestCase): +class TestSnapshot(TestCase): @istest def build_swh_snapshot(self): actual_snap = build_swh_snapshot('revision-id') @@ -36,11 +34,11 @@ # Only for testing purposes. -class TestSvnLoader: +class LoaderNoStorage: """Mixin class to inhibit the persistence and keep in memory the data sent for storage. - cf. SWHSvnLoaderNoStorage + cf. 
SvnLoaderNoStorage """ def __init__(self): @@ -49,6 +47,8 @@ self.all_directories = [] self.all_revisions = [] self.all_releases = [] + self.all_snapshots = [] + # Check at each svn revision that the hash tree computation # does not diverge self.check_revision = 10 @@ -58,6 +58,7 @@ 'directory': self.all_directories, 'revision': self.all_revisions, 'release': self.all_releases, + 'snapshot': self.all_snapshots, } def _add(self, type, l): @@ -86,6 +87,24 @@ def maybe_load_releases(self, releases): raise ValueError('If called, the test must break.') + def maybe_load_snapshot(self, snapshot): + self._add('snapshot', [snapshot]) + + def _store_origin_visit(self): + pass + + def open_fetch_history(self): + pass + + def close_fetch_history_success(self, fetch_history_id): + pass + + def close_fetch_history_failure(self, fetch_history_id): + pass + + def update_origin_visit(self, origin_id, visit, status): + pass + # Override to do nothing at the end def close_failure(self): pass @@ -93,16 +112,29 @@ def close_success(self): pass - def prepare(self, *args, **kwargs): - # Override to only prepare the svn repository - self.svnrepo = self.get_svn_repo(*args) - origin_id = 10 - self.latest_snapshot = self.swh_latest_snapshot_revision( - origin_id, None) + def pre_cleanup(self): + pass -class SWHSvnLoaderNoStorage(TestSvnLoader, SWHSvnLoader): - """An SWHSVNLoader with no persistence. +class LoaderWithState: + """Additional state setup (bypassed by some override for test purposes) + + """ + def __init__(self): + super().__init__() + self.origin = { + 'id': 1, + 'url': '/dev/null', + 'type': 'svn', + } + self.visit = { + 'origin': 1, + 'visit': 1, + } + + +class SvnLoaderNoStorage(LoaderNoStorage, LoaderWithState, SvnLoader): + """An SVNLoader with no persistence. Context: Load a new svn repository using the swh policy (so no update). @@ -115,8 +147,8 @@ return {} -class SWHSvnLoaderUpdateNoStorage(TestSvnLoader, SWHSvnLoader): - """An SWHSVNLoader with no persistence. +class SvnLoaderUpdateNoStorage(LoaderNoStorage, LoaderWithState, SvnLoader): + """An SVNLoader with no persistence. Context: Load a known svn repository using the swh policy. @@ -133,11 +165,11 @@ Check the following for explanation about the hashes: - test_loader.org for (swh policy). - - cf. SWHSvnLoaderITTest + - cf. SvnLoaderITTest """ return { - 'snapshot': None, + 'snapshot': 'something', # need a snapshot of sort 'revision': { 'id': hashutil.hash_to_bytes( '4876cb10aec6f708f7466dddf547567b65f6c39c'), @@ -157,10 +189,9 @@ } -class SWHSvnLoaderUpdateHistoryAlteredNoStorage(TestSvnLoader, SWHSvnLoader): - """An SWHSVNLoader with no persistence. - - Context: Load a known svn repository using the swh policy with its +class SvnLoaderUpdateHistoryAlteredNoStorage(LoaderNoStorage, LoaderWithState, + SvnLoader): + """Context: Load a known svn repository using the swh policy with its history altered so we do not update it. """ @@ -170,7 +201,7 @@ Check the following for explanation about the hashes: - test_loader.org for (swh policy). - - cf. SWHSvnLoaderITTest + - cf. SvnLoaderITTest """ return { @@ -195,28 +226,23 @@ } -class SWHSvnLoaderNewRepositoryITTest(BaseTestSvnLoader): +class SvnLoaderITest1(BaseSvnLoaderTest): + """Load an unknown svn repository results in new data. 
+ + """ def setUp(self): super().setUp() - - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} - - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 2, - } - - self.loader = SWHSvnLoaderNoStorage() - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + self.loader = SvnLoaderNoStorage() @istest - def process_repository(self): - """Process a new repository with swh policy should be ok. + def load(self): + """Load a new repository results in new swh object and snapshot """ # when - self.loader.process_repository(self.origin_visit) + self.loader.load( + svn_url=self.svn_mirror_url, + destination_path=self.destination_path) # then self.assertEquals(len(self.loader.all_revisions), 6) @@ -236,95 +262,100 @@ } self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + # self.assertEquals(self.loader.all_snapshots[0], {}) + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'full') -class SWHSvnLoaderUpdateWithNoChangeITTest(BaseTestSvnLoader): +class SvnLoaderITest2(BaseSvnLoaderTest): + """Load a visited repository with no new change results in no data + change. + + """ def setUp(self): super().setUp() - - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} - - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 3, - } - - self.loader = SWHSvnLoaderUpdateNoStorage() - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + self.loader = SvnLoaderUpdateNoStorage() @istest - def process_repository(self): - """Process a known repository with swh policy and no new data should - be ok. + def load(self): + """Load a repository without new changes results in same snapshot """ # when - with self.assertRaises(SvnLoaderUneventful): - self.loader.process_repository(self.origin_visit) + self.loader.load( + svn_url=self.svn_mirror_url, + destination_path=self.destination_path) # then + + self.assertEquals(len(self.loader.all_contents), 0) + self.assertEquals(len(self.loader.all_directories), 0) self.assertEquals(len(self.loader.all_revisions), 0) self.assertEquals(len(self.loader.all_releases), 0) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + # self.assertEquals(self.loader.all_snapshots[0], {}) + self.assertEqual(self.loader.load_status(), {'status': 'uneventful'}) + self.assertEqual(self.loader.visit_status(), 'full') + +class SvnLoaderITest3(BaseSvnLoaderTest): + """In this scenario, the dump has been tampered with to modify the + commit log. This results in a hash divergence which is + detected at startup. -class SWHSvnLoaderUpdateWithHistoryAlteredITTest(BaseTestSvnLoader): + In effect, that stops the loading and do nothing. + + """ def setUp(self): # the svn repository pkg-gourmet has been updated with changes super().setUp(archive_name='pkg-gourmet-with-updates.tgz') - - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} - - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 4, - } - - self.loader = SWHSvnLoaderUpdateHistoryAlteredNoStorage() - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + self.loader = SvnLoaderUpdateHistoryAlteredNoStorage() @istest - def process_repository(self): - """Process a known repository with swh policy and history altered - should stop and do nothing. 
+ def load(self): + """Load known repository with history altered should do nothing """ # when - with self.assertRaises(SvnLoaderHistoryAltered): - self.loader.args = (self.origin_visit,) - self.loader.process_repository(self.origin_visit) + self.loader.load(svn_url=self.svn_mirror_url, + destination_path=self.destination_path) # then # we got the previous run's last revision (rev 6) # so 2 news + 1 old + self.assertEquals(len(self.loader.all_contents), 0) + self.assertEquals(len(self.loader.all_directories), 0) self.assertEquals(len(self.loader.all_revisions), 0) self.assertEquals(len(self.loader.all_releases), 0) + self.assertEquals(len(self.loader.all_snapshots), 0) + # FIXME: Check the snapshot's state + # self.assertEquals(self.loader.all_snapshots[0], {}) + self.assertEqual(self.loader.load_status(), {'status': 'uneventful'}) + self.assertEqual(self.loader.visit_status(), 'partial') + +class SvnLoaderITest4(BaseSvnLoaderTest): + """In this scenario, the repository has been updated with new changes. + The loading visit should result in new objects stored and 1 new + snapshot. -class SWHSvnLoaderUpdateWithChangesITTest(BaseTestSvnLoader): + """ def setUp(self): # the svn repository pkg-gourmet has been updated with changes super().setUp(archive_name='pkg-gourmet-with-updates.tgz') - - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} - - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 5, - } - - self.loader = SWHSvnLoaderUpdateNoStorage() - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + self.loader = SvnLoaderUpdateNoStorage() @istest def process_repository(self): - """Process updated repository should yield new revisions + """Process updated repository should yield new objects """ # when - self.loader.process_repository(self.origin_visit) + self.loader.load(svn_url=self.svn_mirror_url, + destination_path=self.destination_path) # then # we got the previous run's last revision (rev 6) @@ -346,31 +377,34 @@ self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + # self.assertEquals(self.loader.all_snapshots[0], {}) + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'full') -class SWHSvnLoaderUpdateWithChangesStartFromScratchITTest(BaseTestSvnLoader): - def setUp(self): - # the svn repository pkg-gourmet has been updated with changes - super().setUp(archive_name='pkg-gourmet-with-updates.tgz') - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} +class SvnLoaderITTest5(BaseSvnLoaderTest): + """Context: - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 5, - } + - Repository already injected with successfull data + - New visit from scratch done with successfull load - self.loader = SWHSvnLoaderUpdateNoStorage() - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + """ + def setUp(self): + # the svn repository pkg-gourmet has been updated with changes + super().setUp(archive_name='pkg-gourmet-with-updates.tgz') + self.loader = SvnLoaderUpdateNoStorage() @istest - def process_repository(self): - """Process known repository from scratch should yield revisions again + def load(self): + """Load an existing repository from scratch yields same swh objects """ # when - self.loader.process_repository(self.origin_visit, - start_from_scratch=True) + self.loader.load(svn_url=self.svn_mirror_url, + 
destination_path=self.destination_path, + start_from_scratch=True) # then # we got the previous run's last revision (rev 6) @@ -395,46 +429,70 @@ self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + # self.assertEquals(self.loader.all_snapshots[0], {}) + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'full') -class SWHSvnLoaderUpdateWithUnfinishedLoadingChangesITTest(BaseTestSvnLoader): - def setUp(self): - super().setUp(archive_name='pkg-gourmet-with-updates.tgz') - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} +class SvnLoaderWithPreviousRevisionNoStorage(LoaderNoStorage, LoaderWithState, + SvnLoader): + """An SVNLoader with no persistence. - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 6 - } + Context: Load a known svn repository using the swh policy with its + history altered so we do not update it. - self.loader = SWHSvnLoaderNoStorage() - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + """ + def swh_latest_snapshot_revision(self, origin_id, prev_swh_revision=None): + """Avoid the storage persistence call and return the expected previous + revision for that repository. - @istest - def process_repository(self): - """Process partially visited repository should finish loading + Check the following for explanation about the hashes: + - test_loader.org for (swh policy). + - cf. SvnLoaderITTest """ - previous_unfinished_revision = { - 'id': hashutil.hash_to_bytes( - '4876cb10aec6f708f7466dddf547567b65f6c39c'), - 'parents': [hashutil.hash_to_bytes( - 'a3a577948fdbda9d1061913b77a1588695eadb41')], - 'directory': hashutil.hash_to_bytes( - '0deab3023ac59398ae467fc4bff5583008af1ee2'), - 'target_type': 'revision', - 'metadata': { - 'extra_headers': [ - ['svn_repo_uuid', '3187e211-bb14-4c82-9596-0b59d67cd7f4'], - ['svn_revision', '6'] - ] + return { + 'snapshot': None, + 'revision': { + 'id': hashutil.hash_to_bytes( + '4876cb10aec6f708f7466dddf547567b65f6c39c'), + 'parents': [hashutil.hash_to_bytes( + 'a3a577948fdbda9d1061913b77a1588695eadb41')], + 'directory': hashutil.hash_to_bytes( + '0deab3023ac59398ae467fc4bff5583008af1ee2'), + 'target_type': 'revision', + 'metadata': { + 'extra_headers': [ + ['svn_repo_uuid', '3187e211-bb14-4c82-9596-0b59d67cd7f4'], # noqa + ['svn_revision', '6'] + ] + } } } + + +class SvnLoaderITTest6(BaseSvnLoaderTest): + """Context: + - repository already visited with load successfull + - Changes on existing repository + - New Visit done with successful new data + + """ + def setUp(self): + super().setUp(archive_name='pkg-gourmet-with-updates.tgz') + self.loader = SvnLoaderWithPreviousRevisionNoStorage() + + @istest + def load(self): + """Load from partial previous visit result in new changes + + """ # when - self.loader.process_repository( - self.origin_visit, - last_known_swh_revision=previous_unfinished_revision) + self.loader.load( + svn_url=self.svn_mirror_url, + destination_path=self.destination_path) # then # we got the previous run's last revision (rev 6) @@ -455,27 +513,27 @@ } self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + # self.assertEquals(self.loader.all_snapshots[0], {}) + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'full') + +class SvnLoaderITest7(BaseSvnLoaderTest): + 
"""Context: + - repository already visited with load successfull + - Changes on existing repository + - New Visit done with successful new data -class SWHSvnLoaderUpdateWithUnfinishedLoadingChangesButVisitDoneITTest( - BaseTestSvnLoader): + """ def setUp(self): super().setUp(archive_name='pkg-gourmet-with-updates.tgz') - - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} - - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 9, - } - - self.loader = SWHSvnLoaderUpdateNoStorage() - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + self.loader = SvnLoaderUpdateNoStorage() @istest - def process_repository(self): - """Process known and partial repository should start from last visit + def load(self): + """Load known and partial repository should start from last visit """ previous_unfinished_revision = { @@ -495,9 +553,10 @@ } # when - self.loader.process_repository( - self.origin_visit, - last_known_swh_revision=previous_unfinished_revision) + self.loader.load( + svn_url=self.svn_mirror_url, + destination_path=self.destination_path, + swh_revision=previous_unfinished_revision) # then # we got the previous run's last revision (rev 6) @@ -518,14 +577,18 @@ } self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + # self.assertEquals(self.loader.all_snapshots[0], {}) + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'full') -class SWHSvnLoaderUpdateLessRecentNoStorage(TestSvnLoader, SWHSvnLoader): - """An SWHSVNLoader with no persistence. - - Context: - Load a known svn repository using the swh policy. The last - visit seen is less recent than a previous unfinished crawl. +class SvnLoaderUpdateLessRecentNoStorage(LoaderNoStorage, LoaderWithState, + SvnLoader): + """Context: + Load a known svn repository. The last visit seen is less + recent than a previous unfinished crawl. """ def swh_latest_snapshot_revision(self, origin_id, prev_swh_revision=None): @@ -534,7 +597,7 @@ Check the following for explanation about the hashes: - test_loader.org for (swh policy). - - cf. SWHSvnLoaderITTest + - cf. 
SvnLoaderITTest """ return { @@ -558,25 +621,21 @@ } -class SWHSvnLoaderUnfinishedLoadingChangesSinceLastVisitITTest( - BaseTestSvnLoader): - def setUp(self): - super().setUp(archive_name='pkg-gourmet-with-updates.tgz') +class SvnLoaderITest8(BaseSvnLoaderTest): + """Context: - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} + - Previous visit on existing repository done + - Starting the loading from the last unfinished visit + - New objects are created (1 new snapshot) - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 1, - } - - self.loader = SWHSvnLoaderUpdateLessRecentNoStorage() - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + """ + def setUp(self): + super().setUp(archive_name='pkg-gourmet-with-updates.tgz') + self.loader = SvnLoaderUpdateLessRecentNoStorage() @istest - def process_repository(self): - """Process updated repository should yield revisions from last visit + def load(self): + """Load repository should yield revisions starting from last visit """ previous_unfinished_revision = { @@ -595,9 +654,10 @@ } } # when - self.loader.process_repository( - self.origin_visit, - last_known_swh_revision=previous_unfinished_revision) + self.loader.load( + svn_url=self.svn_mirror_url, + destination_path=self.destination_path, + swh_revision=previous_unfinished_revision) # then # we got the previous run's last revision (rev 6) @@ -618,126 +678,110 @@ } self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + # self.assertEquals(self.loader.all_snapshots[0], {}) + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'full') -class SWHSvnLoaderTestCornerCaseAboutCrlfEolInRepoITTest(BaseTestSvnLoader): - """ - Check that a svn repo containing a versioned file with CRLF line - endings with svn:eol-style property set to 'native' (this is - a violation of svn specification as the file should have been - stored with LF line endings) can be loaded anyway. +class SvnLoaderTTest9(BaseSvnLoaderTest): + """Check that a svn repo containing a versioned file with CRLF line + endings with svn:eol-style property set to 'native' (this is a + violation of svn specification as the file should have been + stored with LF line endings) can be loaded anyway. 
+ """ def setUp(self): super().setUp(archive_name='mediawiki-repo-r407-eol-native-crlf.tgz', filename='mediawiki-repo-r407-eol-native-crlf') - - self.origin = {'id': 1, 'type': 'svn', - 'url': 'https://code.google.com/p/pyang/pyang-repo'} - - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 1, - } - - self.loader = SWHSvnLoaderNoStorage() - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + self.loader = SvnLoaderNoStorage() @istest def process_repository(self): - """ - Process repository with CRLF line endings and svn:eol-style set to 'native' + """Load repository with CRLF endings (svn:eol-style: native) is ok + """ # noqa # when - self.loader.process_repository(self.origin_visit) + self.loader.load(svn_url=self.svn_mirror_url, + destination_path=self.destination_path) expected_revisions = { '7da4975c363101b819756d33459f30a866d01b1b': 'f63637223ee0f7d4951ffd2d4d9547a4882c5d8b' # noqa } - self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + # self.assertEquals(self.loader.all_snapshots[0], {}) + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'full') -class SWHSvnLoaderTestCornerCaseAboutMixedCrlfLfEolInRepoITTest(BaseTestSvnLoader): # noqa - """ - Check that a svn repo containing a versioned file with mixed +class SvnLoaderITest10(BaseSvnLoaderTest): # noqa + """Check that a svn repo containing a versioned file with mixed CRLF/LF line endings with svn:eol-style property set to 'native' (this is a violation of svn specification as mixed line endings for textual content should not be stored when the svn:eol-style property is set) can be loaded anyway. + """ def setUp(self): - super().setUp(archive_name='pyang-repo-r343-eol-native-mixed-lf-crlf.tgz', # noqa - filename='pyang-repo-r343-eol-native-mixed-lf-crlf') - - self.origin = {'id': 1, 'type': 'svn', - 'url': 'https://code.google.com/m/mediawiki/mediawiki-repo'} # noqa - - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 1, - } - - self.loader = SWHSvnLoaderNoStorage() - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + super().setUp( + archive_name='pyang-repo-r343-eol-native-mixed-lf-crlf.tgz', + filename='pyang-repo-r343-eol-native-mixed-lf-crlf') + self.loader = SvnLoaderNoStorage() @istest - def process_repository(self): + def load(self): + """Load repo with mixed CRLF/LF endings (svn:eol-style:native) is ok + """ - Process repository with mixed CRLF/LF line endings and svn:eol-style set to 'native' - """ # noqa - self.loader.process_repository(self.origin_visit) + self.loader.load(svn_url=self.svn_mirror_url, + destination_path=self.destination_path) expected_revisions = { '9c6962eeb9164a636c374be700672355e34a98a7': '16aa6b6271f3456d4643999d234cf39fe3d0cc5a' # noqa } self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + # self.assertEquals(self.loader.all_snapshots[0], {}) + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'full') -class SWHSvnLoaderExternalIdCornerCaseITTest(BaseTestSvnLoader): - def setUp(self): - super().setUp(archive_name='pkg-gourmet-with-external-id.tgz') +class SvnLoaderITest11(BaseSvnLoaderTest): + """Context: - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} + - Repository with svn:external (which is 
not deal with for now) + - Visit is partial with as much data loaded as possible - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 1, - } - - self.loader = SWHSvnLoaderNoStorage() - # override revision-block size - self.loader.config['revision_packet_size'] = 3 - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + """ + def setUp(self): + super().setUp(archive_name='pkg-gourmet-with-external-id.tgz') + self.loader = SvnLoaderNoStorage() @istest - def process_repository(self): + def load(self): """Repository with svn:externals property, will stop raising an error """ previous_unfinished_revision = None # when - with self.assertRaises(SvnLoaderEventful) as exc: - self.loader.process_repository( - self.origin_visit, - last_known_swh_revision=previous_unfinished_revision) - - actual_raised_revision = exc.exception.swh_revision + self.loader.load( + svn_url=self.svn_mirror_url, + destination_path=self.destination_path, + swh_revision=previous_unfinished_revision) # then repositories holds 21 revisions, but the last commit # one holds an 'svn:externals' property which will make the - # loader-svn stops. This will then stop at the 6th iterations - # of 3-revision block size, so only 18 revisions will be - # flushed - self.assertEquals(len(self.loader.all_revisions), 18) + # loader-svn stops at the last revision prior to the bad one + self.assertEquals(len(self.loader.all_revisions), 20) self.assertEquals(len(self.loader.all_releases), 0) - last_revision = 'ffa901b69ca0f46a2261f42948838d19709cb9f8' - + last_revision = '82a7a4a09f9549223429143ba36ad77375e33c5c' expected_revisions = { # revision hash | directory hash '0d7dd5f751cef8fe17e8024f7d6b0e3aac2cfd71': '669a71cce6c424a81ba42b7dc5d560d32252f0ca', # noqa @@ -757,49 +801,43 @@ 'd04ea8afcee6205cc8384c091bfc578931c169fd': 'b0a648b02e55a4dce356ac35187a058f89694ec7', # noqa 'ded78810401fd354ffe894aa4a1e5c7d30a645d1': 'b0a648b02e55a4dce356ac35187a058f89694ec7', # noqa '4ee95e39358712f53c4fc720da3fafee9249ed19': 'c3c98df624733fef4e592bef983f93e2ed02b179', # noqa - last_revision : 'c3c98df624733fef4e592bef983f93e2ed02b179', # noqa + 'ffa901b69ca0f46a2261f42948838d19709cb9f8': 'c3c98df624733fef4e592bef983f93e2ed02b179', # noqa + '0148ae3eaa520b73a50802c59f3f416b7a36cf8c': '844d4646d6c2b4f3a3b2b22ab0ee38c7df07bab2', # noqa + last_revision: '0de6e75d2b79ec90d00a3a7611aa3861b2e4aa5e', # noqa } # The last revision being the one used later to start back from - self.assertEquals(hashutil.hash_to_hex(actual_raised_revision['id']), - last_revision) - self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'partial') + +class SvnLoaderITest12(BaseSvnLoaderTest): + """Edge cases: + - first create a file and commit it. + Remove it, then add folder holding the same name, commit. + - do the same scenario with symbolic link (instead of file) -class SWHSvnLoaderLinkFileAndFolderWithSameNameITTest(BaseTestSvnLoader): + """ def setUp(self): - # edge cases: - # - first create a file and commit it. - # Remove it, then add folder holding the same name, commit. 
- # - do the same scenario with symbolic link (instead of file) super().setUp( archive_name='pkg-gourmet-with-edge-case-links-and-files.tgz') - - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} - - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 1, - } - - self.loader = SWHSvnLoaderNoStorage() - # override revision-block size - self.loader.config['revision_packet_size'] = 3 - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + self.loader = SvnLoaderNoStorage() @istest - def process_repository(self): - """File/Link destroyed prior to folder with same name creation should be ok + def load(self): + """File/Link removed prior to folder with same name creation is ok """ previous_unfinished_revision = None # when - self.loader.process_repository( - self.origin_visit, - last_known_swh_revision=previous_unfinished_revision) + self.loader.load( + svn_url=self.svn_mirror_url, + destination_path=self.destination_path, + swh_revision=previous_unfinished_revision) # then repositories holds 14 revisions, but the last commit self.assertEquals(len(self.loader.all_revisions), 19) @@ -830,40 +868,32 @@ } self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'full') + +class SvnLoaderITTest13(BaseSvnLoaderTest): + """Edge cases: + - wrong symbolic link + - wrong symbolic link with empty space names -class SWHSvnLoaderWrongLinkCasesITTest(BaseTestSvnLoader): + """ def setUp(self): - # edge cases: - # - wrong symbolic link - # - wrong symbolic link with empty space names super().setUp( archive_name='pkg-gourmet-with-wrong-link-cases.tgz') - - self.origin = {'id': 2, 'type': 'svn', 'url': 'file:///dev/null'} - - self.origin_visit = { - 'origin': self.origin['id'], - 'visit': 1, - } - - self.loader = SWHSvnLoaderNoStorage() - # override revision-block size - self.loader.config['revision_packet_size'] = 3 - self.loader.prepare( - self.svn_mirror_url, self.destination_path, self.origin) + self.loader = SvnLoaderNoStorage() @istest - def process_repository(self): + def load(self): """Wrong link or empty space-named link should be ok """ - previous_unfinished_revision = None - # when - self.loader.process_repository( - self.origin_visit, - last_known_swh_revision=previous_unfinished_revision) + self.loader.load( + svn_url=self.svn_mirror_url, + destination_path=self.destination_path) # then repositories holds 14 revisions, but the last commit self.assertEquals(len(self.loader.all_revisions), 21) @@ -897,3 +927,7 @@ } self.assertRevisionsOk(expected_revisions) + self.assertEquals(len(self.loader.all_snapshots), 1) + # FIXME: Check the snapshot's state + self.assertEqual(self.loader.load_status(), {'status': 'eventful'}) + self.assertEqual(self.loader.visit_status(), 'full')
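
For quick manual verification of the behaviour these tests exercise, here is a minimal sketch (not part of the patch) of driving the renamed `SvnLoader` entry point directly. The repository URL and destination path are placeholders, and the loader is assumed to pick up the storage configuration described in the README ($SWH_CONFIG_PATH); only calls that appear in the test diff above (`load`, `load_status`, `visit_status`) are used:

```
# Minimal sketch, assuming $SWH_CONFIG_PATH is configured as in the README.
# svn_url and destination_path below are placeholders, not real repositories.
import logging

from swh.loader.svn.loader import SvnLoader

logging.basicConfig(level=logging.DEBUG)

loader = SvnLoader()

# Same keyword arguments as the integration tests above.
loader.load(
    svn_url='file:///home/storage/svn/repo/pkg-gourmet',  # placeholder mirror
    destination_path='/tmp')

# The reworked API exposes the outcome the tests assert on:
print(loader.load_status())   # e.g. {'status': 'eventful'}
print(loader.visit_status())  # e.g. 'full' or 'partial'
```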